├── .gitignore ├── README.md ├── code_editing_examples.md ├── code_explanation_examples.md ├── code_writing_examples.md ├── examples ├── Backtranslation_of_SQL_queries.py ├── Classification_using_embeddings.ipynb ├── Clustering.ipynb ├── Clustering_for_transaction_classification.ipynb ├── Code_search.ipynb ├── Customizing_embeddings.ipynb ├── Embedding_long_inputs.ipynb ├── Fine-tuned_classification.ipynb ├── Get_embeddings.ipynb ├── How_to_count_tokens_with_tiktoken.ipynb ├── How_to_handle_rate_limits.ipynb ├── How_to_stream_completions.ipynb ├── Multiclass_classification_for_transactions.ipynb ├── Obtain_dataset.ipynb ├── Question_answering_using_embeddings.ipynb ├── Recommendation_using_embeddings.ipynb ├── Regression_using_embeddings.ipynb ├── Semantic_text_search_using_embeddings.ipynb ├── Unit_test_writing_using_a_multi-step_prompt.ipynb ├── User_and_product_embeddings.ipynb ├── Visualizing_embeddings_in_2D.ipynb ├── Visualizing_embeddings_in_3D.ipynb ├── Zero-shot_classification_with_embeddings.ipynb ├── api_request_parallel_processor.py ├── azure │ ├── completions.ipynb │ ├── embeddings.ipynb │ └── finetuning.ipynb ├── book_translation │ ├── data │ │ ├── geometry_English.tex │ │ └── geometry_slovenian.tex │ └── translate_latex_book.ipynb ├── dalle │ └── Image_generations_edits_and_variations_with_DALL-E.ipynb ├── data │ ├── 25000_spend_dataset_current.csv │ ├── AG_news_samples.csv │ ├── dbpedia_samples.jsonl │ ├── example_requests_to_parallel_process.jsonl │ ├── fine_food_reviews_1k.csv │ ├── fine_food_reviews_with_embeddings_1k.csv │ ├── labelled_transactions.csv │ ├── library_transactions_with_embeddings_359.csv │ ├── recommendations_embeddings_cache.pkl │ └── snli_1.0_train_2k.csv └── fine-tuned_qa │ ├── answers_with_ft.py │ ├── olympics-1-collect-data.ipynb │ ├── olympics-2-create-qa.ipynb │ └── olympics-3-train-qa.ipynb ├── how_to_work_with_large_language_models.md ├── images ├── OpenAI_Logo.png ├── chain_of_thought_fig1.png ├── chain_of_thought_fig11.png ├── chain_of_thought_fig3.png ├── chain_of_thought_fig5.png ├── faithful-reasoning_fig1.png ├── faithful-reasoning_fig2.png ├── faithful-reasoning_fig3.png ├── faithful-reasoning_fig4.png ├── faithful-reasoning_fig5.png ├── faithful-reasoning_fig7.png ├── faithful-reasoning_tab2.png ├── faithful-reasoning_tab5.png ├── least-to-most_fig1.png ├── least-to-most_tab11.png ├── least-to-most_tab4.png ├── least-to-most_tab9.png ├── lm_cascades_fig1.png ├── lm_cascades_fig3.png ├── lm_cascades_fig4.png ├── lm_cascades_fig5.png ├── lm_cascades_fig6.png ├── maieutic_fig2.png ├── maieutic_fig6.png ├── maieutic_tab1.png ├── selection-inference_fig1.png ├── selection-inference_fig4.png ├── self-consistency_fig1.png ├── self-consistency_fig3.png ├── star_fig1.png ├── star_tab1.png ├── verifiers_fig3.png ├── verifiers_fig5.png ├── zero-shot_reasoners_fig1.png ├── zero-shot_reasoners_fig2.png ├── zero-shot_reasoners_tab1.png └── zero-shot_reasoners_tab5.png ├── solutions └── web_crawl_Q&A │ ├── requirements.txt │ ├── web-qa.ipynb │ └── web-qa.py ├── techniques_to_improve_reliability.md ├── text_comparison_examples.md ├── text_editing_examples.md ├── text_explanation_examples.md ├── text_writing_examples.md └── transition_guides_for_deprecated_API_endpoints ├── README.md ├── answers_functionality_example.py ├── classification_functionality_example.py └── search_functionality_example.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | 
*.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 | 
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 | 
68 | # Scrapy stuff:
69 | .scrapy
70 | 
71 | # Sphinx documentation
72 | docs/_build/
73 | 
74 | # PyBuilder
75 | target/
76 | 
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 | 
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 | 
84 | # pyenv
85 | .python-version
86 | 
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 | 
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # Data
132 | *transactions*.jsonl
133 | /examples/data/transactions*
134 | *.DS_Store
135 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 
2 | 
3 | # OpenAI Cookbook (Chinese Edition)
4 | 
5 | ![openai-logo](https://raw.githubusercontent.com/imcda/openai-cookbook-zh-cn/main/images/OpenAI_Logo.png)
6 | 
7 | The OpenAI Cookbook (Chinese Edition) shares example code for accomplishing common tasks with the [OpenAI API].
8 | 
9 | To run these examples, you'll need an OpenAI account and an associated API key ([create a free account][API Signup]).
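The examples assume that key is available to the client library. As a minimal setup sketch (this assumes the `openai` Python package is installed and that the key has been exported as an environment variable; adjust both to your own setup):

```python
# Minimal setup sketch -- assumes `pip install openai` and a key exported
# beforehand, e.g. `export OPENAI_API_KEY="sk-..."` in your shell.
import os
import openai

openai.api_key = os.environ["OPENAI_API_KEY"]  # read the key from the environment
```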
10 | 
11 | Most code examples are written in Python, though the concepts can be applied in any language.
12 | 
13 | ## Guides & examples
14 | 
15 | * API usage
16 |   * [How to handle rate limits](examples/How_to_handle_rate_limits.ipynb)
17 |   * [Example parallel processing script that avoids hitting rate limits](examples/api_request_parallel_processor.py)
18 |   * [How to count tokens with tiktoken](examples/How_to_count_tokens_with_tiktoken.ipynb)
19 |   * [How to stream completions](examples/How_to_stream_completions.ipynb)
20 | * GPT-3
21 |   * [Guide: How to work with large language models](how_to_work_with_large_language_models.md)
22 |   * [Guide: Techniques to improve reliability](techniques_to_improve_reliability.md)
23 |   * [How to write unit tests using a multi-step prompt](examples/Unit_test_writing_using_a_multi-step_prompt.ipynb)
24 |   * [Text writing examples](text_writing_examples.md)
25 |   * [Text explanation examples](text_explanation_examples.md)
26 |   * [Text editing examples](text_editing_examples.md)
27 |   * [Code writing examples](code_writing_examples.md)
28 |   * [Code explanation examples](code_explanation_examples.md)
29 |   * [Code editing examples](code_editing_examples.md)
30 | * Embeddings
31 |   * [Text comparison examples](text_comparison_examples.md)
32 |   * [How to get embeddings](examples/Get_embeddings.ipynb)
33 |   * [Question answering using embeddings](examples/Question_answering_using_embeddings.ipynb)
34 |   * [Semantic text search using embeddings](examples/Semantic_text_search_using_embeddings.ipynb)
35 |   * [Recommendations using embeddings](examples/Recommendation_using_embeddings.ipynb)
36 |   * [Clustering embeddings](examples/Clustering.ipynb)
37 |   * [Visualizing embeddings in 2D](examples/Visualizing_embeddings_in_2D.ipynb) or [3D](examples/Visualizing_embeddings_in_3D.ipynb)
38 |   * [Embedding long texts](examples/Embedding_long_inputs.ipynb)
39 | * Fine-tuning GPT-3
40 |   * [Guide: best practices for fine-tuning GPT-3 to classify text](https://docs.google.com/document/d/1rqj7dkuvl7Byd5KQPUJRxc19BJt8wo0yHNwK84KfU3Q/edit)
41 |   * [Fine-tuned classification](examples/Fine-tuned_classification.ipynb)
42 | * DALL-E
43 |   * [How to generate and edit images with DALL-E](examples/dalle/Image_generations_edits_and_variations_with_DALL-E.ipynb)
44 | * Azure OpenAI (OpenAI on Microsoft Azure)
45 |   * [How to get completions from Azure OpenAI](examples/azure/completions.ipynb)
46 |   * [How to get embeddings from Azure OpenAI](examples/azure/embeddings.ipynb)
47 |   * [How to fine-tune GPT-3 with Azure OpenAI](examples/azure/finetuning.ipynb)
48 | 
49 | ## Related resources
50 | 
51 | Beyond the code examples here, you can learn about the [OpenAI API] from the following resources:
52 | 
53 | * Try the API in the [OpenAI Playground]
54 | * Read about the API in the [OpenAI Documentation]
55 | * Discuss the API in the [OpenAI Community Forum]
56 | * Look for help in the [OpenAI Help Center]
57 | * See more example use cases in [OpenAI Examples]
58 | * Or simply try out the remarkable [ChatGPT] for yourself
59 | * And don't forget to keep up with our latest news on the [OpenAI Blog]
60 | 
61 | ## Contributing
62 | 
63 | If there are examples or guides you'd like to see, feel free to suggest them on the [issues page].
64 | 
65 | [ChatGPT]: https://chat.openai.com/
66 | [OpenAI API]: https://openai.com/api/
67 | [API Signup]: https://beta.openai.com/signup
68 | [OpenAI Playground]: https://beta.openai.com/playground
69 | [OpenAI Documentation]: https://beta.openai.com/docs/introduction
70 | [OpenAI Community Forum]: https://community.openai.com/top?period=monthly
71 | [OpenAI Help Center]: https://help.openai.com/en/
72 | [OpenAI Examples]: https://beta.openai.com/examples
73 | [OpenAI Blog]: https://openai.com/blog/
74 | [issues page]: https://github.com/openai/openai-cookbook/issues
75 | 
76 | ## Further discussion
77 | 
78 | While working on this translation I also ran into the limits of my own ability: for some terms I could not find an accurate rendering and kept the original English word, while others I translated according to my own understanding but still find imperfect. They are listed below for discussion; feel free to join the conversation in the issues.
79 | 
80 | ### Terms that may have better, more accurate translations
81 | 1. token
82 | 2. How_to_stream_completions
83 | 3. Embeddings
-------------------------------------------------------------------------------- /code_editing_examples.md: --------------------------------------------------------------------------------
1 | # Code editing examples
2 | 
3 | OpenAI's [edits](https://openai.com/blog/gpt-3-edit-insert/) endpoint is particularly useful for editing code.
4 | 
5 | Unlike completions, edits takes two inputs: the text to edit and an instruction.
6 | 
7 | For example, if you want to edit a Python function, you can supply the text of the function together with an instruction like "add a docstring".
8 | 
9 | Example text input to `code-davinci-edit-001`:
10 | 
11 | ```python
12 | def tribonacci(n):
13 |     if n == 0:
14 |         return 0
15 |     elif n == 1:
16 |         return 1
17 |     elif n == 2:
18 |         return 1
19 |     elif n == 3:
20 |         return 2
21 |     else:
22 |         return tribonacci(n-1) + tribonacci(n-2) + tribonacci(n-3)
23 | ```
24 | 
25 | Example instruction inputs:
26 | 
27 | ```text
28 | add a docstring
29 | ```
30 | 
31 | ```text
32 | Add typing, using Python 3.9 conventions
33 | ```
34 | 
35 | ```text
36 | improve the runtime
37 | ```
38 | 
39 | ```text
40 | Add a test.
41 | ```
42 | 
43 | ```text
44 | Translate to JavaScript (or Rust or Lisp or any language you like)
45 | ```
46 | 
47 | Example output after improving the runtime and translating to JavaScript:
48 | 
49 | ```JavaScript
50 | function tribonacci(n) {
51 |   let a = 0;
52 |   let b = 1;
53 |   let c = 1;
54 |   for (let i = 0; i < n; i++) {
55 |     [a, b, c] = [b, c, a + b + c];
56 |   }
57 |   return a;
58 | }
59 | ```
60 | 
61 | As you can see, `code-davinci-edit-001` was able to successfully reduce the function's runtime from exponential down to linear, as well as convert it from Python to JavaScript.
62 | 
63 | Experiment with code editing using `code-davinci-edit-001` in the [OpenAI Playground](https://beta.openai.com/playground?mode=edit&model=code-davinci-edit-001).
-------------------------------------------------------------------------------- /code_explanation_examples.md: --------------------------------------------------------------------------------
1 | # Code explanation examples
2 | 
3 | GPT's understanding of code can be applied to many use cases, e.g.:
4 | 
5 | * Generating in-code documentation (e.g., Python docstrings, git commit messages)
6 | * Generating out-of-code documentation (e.g., man pages)
7 | * An interactive code exploration tool
8 | * Communicating program results back to users via a natural language interface
9 | 
10 | For example, if you wanted to understand a SQL query, you could give `code-davinci-002` the following example prompt:
11 | 
12 | ````text
13 | A SQL query:
14 | ```
15 | SELECT c.customer_id
16 | FROM Customers c
17 | JOIN Streaming s
18 | ON c.customer_id = s.customer_id
19 | WHERE c.signup_date BETWEEN '2020-03-01' AND '2020-03-31'
20 | AND s.watch_date BETWEEN c.signup_date AND DATE_ADD(c.signup_date, INTERVAL 30 DAY)
21 | GROUP BY c.customer_id
22 | HAVING SUM(s.watch_minutes) > 50 * 60
23 | ```
24 | 
25 | Questions:
26 | 1. What does the SQL query do?
27 | 2. Why might someone be interested in this time period?
28 | 3. Why might a company be interested in this SQL query?
29 | 
30 | Answers:
31 | ````
32 | 
33 | [Output](https://beta.openai.com/playground/p/Sv1VQKbJV1TZKmiTK9r6nlj3):
34 | 
35 | ```text
36 | 1. The SQL query finds all customers who signed up in March 2020 and watched more than 50 hours of content in the first 30 days after signing up.
37 | 2. The time period is interesting because it is the first month of the COVID-19 pandemic.
38 | 3. A company might be interested in this SQL query because it can help them understand how the pandemic has affected their business.
39 | ```
40 | 
41 | Note that `code-davinci-002` is not trained to follow instructions and therefore usually needs examples or other structure to help steer its output, as well as stop sequences to stop generating. For easier prompting, try `text-davinci-003`.
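As a rough sketch of how the prompt above might be sent with the Python library (the parameter values below are illustrative assumptions rather than tuned recommendations, and `prompt` is assumed to hold the "A SQL query: ... Answers:" text shown earlier):

```python
# Illustrative sketch: send the SQL-explanation prompt to the completions
# endpoint. The stop sequence ends generation before the model starts
# writing a new fenced code block.
import openai

response = openai.Completion.create(
    engine="code-davinci-002",
    prompt=prompt,     # the example prompt shown above
    max_tokens=256,    # illustrative cap on answer length
    temperature=0,     # low temperature keeps explanations focused
    stop=["```"],      # stop before another code block begins
)
print(response.choices[0].text)
```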
-------------------------------------------------------------------------------- /code_writing_examples.md: --------------------------------------------------------------------------------
1 | # Code writing examples
2 | 
3 | GPT-3 is able to write code as well as text.
4 | 
5 | Here's an example of `code-davinci-002` writing a SQL query:
6 | 
7 | ````text
8 | SQL tables (and columns):
9 | * Customers(customer_id, signup_date)
10 | * Streaming(customer_id, video_id, watch_date, watch_minutes)
11 | 
12 | A well-written SQL query that lists customers who signed up during March 2020 and watched more than 50 hours of video in their first 30 days:
13 | ```
14 | ````
15 | 
16 | [Output](https://beta.openai.com/playground/p/r2mw99cANoa0TJHok725CeaC):
17 | 
18 | ```sql
19 | SELECT c.customer_id
20 | FROM Customers c
21 | JOIN Streaming s
22 | ON c.customer_id = s.customer_id
23 | WHERE c.signup_date BETWEEN '2020-03-01' AND '2020-03-31'
24 | AND s.watch_date BETWEEN c.signup_date AND DATE_ADD(c.signup_date, INTERVAL 30 DAY)
25 | GROUP BY c.customer_id
26 | HAVING SUM(s.watch_minutes) > 50 * 60
27 | ```
28 | 
29 | Helpfully, `code-davinci-002` is able to make inferences from variable names; for example, it infers that `watch_minutes` has units of minutes and therefore needs to be converted by a factor of 60 before being compared with 50 hours.
30 | 
31 | For easier prompting, you can also try `text-davinci-003`.
-------------------------------------------------------------------------------- /examples/Backtranslation_of_SQL_queries.py: --------------------------------------------------------------------------------
1 | from typing import List, Tuple, Union
2 | 
3 | from smokey import Smokey
4 | 
5 | import openai
6 | 
7 | 
8 | def get_candidates(
9 |     prompt: str,
10 |     stop: List[str],
11 |     temperature: float,
12 |     priming_prefix: str,
13 |     engine: str,
14 |     n: int = 5,
15 | ) -> List[str]:
16 |     """
17 |     Generate N candidate completions for the prompt, sampled at the given temperature.
18 | 
19 |     :param prompt: The prompt to start the generation with.
20 |     :param stop: A list of tokens that indicate the end of the generation.
21 |     :param temperature: The temperature of the generation.
22 |     :param priming_prefix: The prefix to use for the priming.
23 |     :param engine: The engine to use for the generation.
24 |     :param n: The number of completions to generate.
25 |     :return: A list of completions.
26 |     """
27 |     response = openai.Completion.create(
28 |         engine=engine,
29 |         prompt=prompt,
30 |         temperature=temperature,
31 |         max_tokens=150,
32 |         top_p=1,
33 |         frequency_penalty=0,
34 |         presence_penalty=0,
35 |         stop=stop,
36 |         n=n,
37 |     )
38 |     responses = [priming_prefix + choice.text for choice in response.choices]
39 |     return responses
40 | 
41 | 
42 | def rindex(lst: List, value: str) -> int:
43 |     """
44 |     Return the index of the last occurrence of a value in a list.
45 | 
46 |     :param lst: The list to search in.
47 |     :param value: The value to search for.
48 |     :return: The index of the last occurrence of the value.
49 |     """
50 |     try:
51 |         return len(lst) - lst[::-1].index(value) - 1
52 |     except ValueError:
53 |         raise ValueError(f"Answer start token `{value}` not found in the eval template")
54 | 
55 | 
56 | def eval_candidate(
57 |     candidate_answer: str,
58 |     original_instruction: str,
59 |     eval_template: str,
60 |     answer_start_token: str,
61 |     engine: str,
62 | ) -> float:
63 |     """
64 |     Evaluate a candidate answer by calculating the average log probability
65 |     of the original instruction, given the candidate answer with a specific
66 |     evaluation template, aimed at reconstructing the original instruction.
67 | 
68 |     :param candidate_answer: The candidate answer to evaluate.
69 |     :param original_instruction: The original instruction.
70 |     :param eval_template: The template to use for the evaluation.
71 |     :param answer_start_token: The token to use to indicate the start of the answer.
72 |     :param engine: The engine to use for the evaluation.
73 |     :return: The evaluation of the candidate answer.
74 |     """
75 |     response = openai.Completion.create(
76 |         engine=engine,
77 |         prompt=eval_template.format(candidate_answer, original_instruction),
78 |         temperature=0,
79 |         max_tokens=0,
80 |         top_p=1,
81 |         frequency_penalty=0,
82 |         presence_penalty=0,
83 |         logprobs=1,
84 |         echo=True,
85 |     )
86 | 
87 |     answer_start = rindex(
88 |         response["choices"][0]["logprobs"]["tokens"], answer_start_token
89 |     )
90 |     logprobs = response["choices"][0]["logprobs"]["token_logprobs"][answer_start + 1 :]
91 |     return sum(logprobs) / len(logprobs)
92 | 
93 | 
94 | def backtranslation(
95 |     prompt_template: str,
96 |     additional_info: str,
97 |     instruction: str,
98 |     eval_template: str,
99 |     priming_prefix: str = "SELECT",
100 |     stop1: List[str] = ["#", ";"],
101 |     answer_start_token: str = "--",
102 |     n: int = 5,
103 |     temperature: float = 0.5,
104 |     return_all_results: bool = False,
105 |     engine: str = "davinci-codex",
106 | ) -> Union[str, List[Tuple[str, float]]]:
107 |     """
108 |     Generate a number of SQL queries given a natural language instruction,
109 |     and pick the best one based on the average log probability of explaining the
110 |     candidate SQL query with the exact original instruction, when prompted for
111 |     a natural language explanation of the candidate SQL query.
112 | 
113 |     :param prompt_template: The template to use for the prompt to generate SQL.
114 |     :param additional_info: Additional information to include in the prompt
115 |         (SQL Tables, and their properties).
116 |     :param instruction: The instruction in natural language.
117 |     :param eval_template: The template to use for the evaluation.
118 |     :param priming_prefix: The prefix to use for the priming of the SQL query.
119 |     :param stop1: A list of tokens that indicate the end of the generation.
120 |     :param answer_start_token: The token to use to indicate the start of the
121 |         natural answer.
122 |     :param n: The number of candidates to generate.
123 |     :param temperature: The temperature of the generation.
124 |     :param return_all_results: Whether to return all results or just the best one.
125 |     :param engine: The engine to use for the generation and evaluation.
126 |     :return: The best SQL query, or a list of all scored generated SQL queries.
127 |     """
128 |     prompt_template = prompt_template.format(
129 |         additional_info, instruction, priming_prefix
130 |     )
131 | 
132 |     candidates = []
133 |     responses = get_candidates(
134 |         prompt_template, stop1, temperature, priming_prefix, engine=engine, n=n
135 |     )
136 |     for i in range(n):
137 |         quality = eval_candidate(
138 |             responses[i],
139 |             instruction,
140 |             eval_template,
141 |             answer_start_token,
142 |             engine=engine,
143 |         )
144 |         candidates.append((responses[i], quality))
145 | 
146 |     candidates.sort(key=lambda x: x[1], reverse=True)
147 |     if return_all_results:
148 |         return candidates
149 |     return candidates[0][0]
150 | 
151 | 
152 | def main(
153 |     nl_query: str = "Return the name of each department that had more than 10 employees in June 2021",
154 |     eval_template: str = "{};\n-- Explanation of the above query in human readable format\n-- {}",
155 |     table_definitions: str = "# Employee(id, name, department_id)\n# Department(id, name, address)\n# Salary_Payments(id, employee_id, amount, date)\n",
156 |     prompt_template: str = "### Postgres SQL tables, with their properties:\n#\n{}#\n### {}\n{}",
157 |     n: int = 3,
158 |     temperature: float = 0.3,
159 |     engine: str = "davinci-codex",
160 | ):
161 |     """
162 |     Generate a number of SQL queries given a natural language instruction,
163 |     and pick the best one based on the highest backtranslation score.
164 | 
165 |     :param nl_query: The natural language query.
166 |     :param eval_template: The template to use for the evaluation.
167 |     :param table_definitions: The definitions of the tables used in the query.
168 |     :param prompt_template: The template to use for the prompt to generate SQL.
169 |     :param n: The number of candidates to generate.
170 |     :param temperature: The temperature of the generation.
171 |     :param engine: The engine to use for the generation and evaluation.
172 |     :return: None; the best SQL query is printed to stdout.
173 |     """
174 | 
175 |     result = backtranslation(
176 |         prompt_template,
177 |         table_definitions,
178 |         nl_query,
179 |         eval_template,
180 |         priming_prefix="SELECT",
181 |         temperature=temperature,
182 |         n=n,
183 |         engine=engine,
184 |     )
185 |     print(result)
186 | 
187 | 
188 | if __name__ == "__main__":
189 |     Smokey(main)
190 | 
-------------------------------------------------------------------------------- /examples/Code_search.ipynb: --------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "attachments": {},
5 |    "cell_type": "markdown",
6 |    "metadata": {},
7 |    "source": [
8 |     "## Code search\n",
9 |     "\n",
10 |     "We index our own [openai-python code repository](https://github.com/openai/openai-python), and show how it can be searched. We implement a simple version of file parsing and extraction of functions from Python files."
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "Total number of py files: 51\n", 23 | "Total number of functions extracted: 97\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import os\n", 29 | "from glob import glob\n", 30 | "import pandas as pd\n", 31 | "\n", 32 | "def get_function_name(code):\n", 33 | " \"\"\"\n", 34 | " Extract function name from a line beginning with \"def \"\n", 35 | " \"\"\"\n", 36 | " assert code.startswith(\"def \")\n", 37 | " return code[len(\"def \"): code.index(\"(\")]\n", 38 | "\n", 39 | "def get_until_no_space(all_lines, i) -> str:\n", 40 | " \"\"\"\n", 41 | " Get all lines until a line outside the function definition is found.\n", 42 | " \"\"\"\n", 43 | " ret = [all_lines[i]]\n", 44 | " for j in range(i + 1, i + 10000):\n", 45 | " if j < len(all_lines):\n", 46 | " if len(all_lines[j]) == 0 or all_lines[j][0] in [\" \", \"\\t\", \")\"]:\n", 47 | " ret.append(all_lines[j])\n", 48 | " else:\n", 49 | " break\n", 50 | " return \"\\n\".join(ret)\n", 51 | "\n", 52 | "def get_functions(filepath):\n", 53 | " \"\"\"\n", 54 | " Get all functions in a Python file.\n", 55 | " \"\"\"\n", 56 | " whole_code = open(filepath).read().replace(\"\\r\", \"\\n\")\n", 57 | " all_lines = whole_code.split(\"\\n\")\n", 58 | " for i, l in enumerate(all_lines):\n", 59 | " if l.startswith(\"def \"):\n", 60 | " code = get_until_no_space(all_lines, i)\n", 61 | " function_name = get_function_name(code)\n", 62 | " yield {\"code\": code, \"function_name\": function_name, \"filepath\": filepath}\n", 63 | "\n", 64 | "\n", 65 | "# get user root directory\n", 66 | "root_dir = os.path.expanduser(\"~\")\n", 67 | "# note: for this code to work, the openai-python repo must be downloaded and placed in your root directory\n", 68 | "\n", 69 | "# path to code repository directory\n", 70 | "code_root = root_dir + \"/openai-python\"\n", 71 | "\n", 72 | "code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], '*.py'))]\n", 73 | "print(\"Total number of py files:\", len(code_files))\n", 74 | "\n", 75 | "if len(code_files) == 0:\n", 76 | " print(\"Double check that you have downloaded the openai-python repo and set the code_root variable correctly.\")\n", 77 | "\n", 78 | "all_funcs = []\n", 79 | "for code_file in code_files:\n", 80 | " funcs = list(get_functions(code_file))\n", 81 | " for func in funcs:\n", 82 | " all_funcs.append(func)\n", 83 | "\n", 84 | "print(\"Total number of functions extracted:\", len(all_funcs))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/html": [ 95 | "
" 158 | ], 159 | "text/plain": [ 160 | " code function_name \\\n", 161 | "0 def _console_log_level():\\n if openai.log i... _console_log_level \n", 162 | "1 def log_debug(message, **params):\\n msg = l... log_debug \n", 163 | "2 def log_info(message, **params):\\n msg = lo... log_info \n", 164 | "3 def log_warn(message, **params):\\n msg = lo... log_warn \n", 165 | "4 def logfmt(props):\\n def fmt(key, val):\\n ... logfmt \n", 166 | "\n", 167 | " filepath code_embedding \n", 168 | "0 /openai/util.py [0.03389773145318031, -0.004390408284962177, 0... \n", 169 | "1 /openai/util.py [-0.004034275189042091, 0.004895383026450872, ... \n", 170 | "2 /openai/util.py [0.004882764536887407, 0.0033515947870910168, ... \n", 171 | "3 /openai/util.py [0.002535992069169879, -0.010829543694853783, ... \n", 172 | "4 /openai/util.py [0.016732551157474518, 0.017367802560329437, 0... " 173 | ] 174 | }, 175 | "execution_count": 2, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "from openai.embeddings_utils import get_embedding\n", 182 | "\n", 183 | "df = pd.DataFrame(all_funcs)\n", 184 | "df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n", 185 | "df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, \"\"))\n", 186 | "df.to_csv(\"data/code_search_openai-python.csv\", index=False)\n", 187 | "df.head()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 3, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "/openai/tests/test_endpoints.py:test_completions score=0.826\n", 200 | "def test_completions():\n", 201 | " result = openai.Completion.create(prompt=\"This was a test\", n=5, engine=\"ada\")\n", 202 | " assert len(result.choices) == 5\n", 203 | "\n", 204 | "\n", 205 | "----------------------------------------------------------------------\n", 206 | "/openai/tests/test_endpoints.py:test_completions_model score=0.811\n", 207 | "def test_completions_model():\n", 208 | " result = openai.Completion.create(prompt=\"This was a test\", n=5, model=\"ada\")\n", 209 | " assert len(result.choices) == 5\n", 210 | " assert result.model.startswith(\"ada\")\n", 211 | "\n", 212 | "\n", 213 | "----------------------------------------------------------------------\n", 214 | "/openai/tests/test_endpoints.py:test_completions_multiple_prompts score=0.808\n", 215 | "def test_completions_multiple_prompts():\n", 216 | " result = openai.Completion.create(\n", 217 | " prompt=[\"This was a test\", \"This was another test\"], n=5, engine=\"ada\"\n", 218 | " )\n", 219 | " assert len(result.choices) == 10\n", 220 | "\n", 221 | "\n", 222 | "----------------------------------------------------------------------\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "from openai.embeddings_utils import cosine_similarity\n", 228 | "\n", 229 | "def search_functions(df, code_query, n=3, pprint=True, n_lines=7):\n", 230 | " embedding = get_embedding(code_query, engine='text-embedding-ada-002')\n", 231 | " df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))\n", 232 | "\n", 233 | " res = df.sort_values('similarities', ascending=False).head(n)\n", 234 | " if pprint:\n", 235 | " for r in res.iterrows():\n", 236 | " print(r[1].filepath+\":\"+r[1].function_name + \" score=\" + str(round(r[1].similarities, 3)))\n", 237 | " print(\"\\n\".join(r[1].code.split(\"\\n\")[:n_lines]))\n", 238 | " 
print('-'*70)\n", 239 | " return res\n", 240 | "\n", 241 | "res = search_functions(df, 'Completions API tests', n=3)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 4, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "/openai/validators.py:format_inferrer_validator score=0.751\n", 254 | "def format_inferrer_validator(df):\n", 255 | " \"\"\"\n", 256 | " This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.\n", 257 | " It will also suggest to use ada and explain train/validation split benefits.\n", 258 | " \"\"\"\n", 259 | " ft_type = infer_task_type(df)\n", 260 | " immediate_msg = None\n", 261 | "----------------------------------------------------------------------\n", 262 | "/openai/validators.py:get_validators score=0.748\n", 263 | "def get_validators():\n", 264 | " return [\n", 265 | " num_examples_validator,\n", 266 | " lambda x: necessary_column_validator(x, \"prompt\"),\n", 267 | " lambda x: necessary_column_validator(x, \"completion\"),\n", 268 | " additional_column_validator,\n", 269 | " non_empty_field_validator,\n", 270 | "----------------------------------------------------------------------\n", 271 | "/openai/validators.py:infer_task_type score=0.738\n", 272 | "def infer_task_type(df):\n", 273 | " \"\"\"\n", 274 | " Infer the likely fine-tuning task type from the data\n", 275 | " \"\"\"\n", 276 | " CLASSIFICATION_THRESHOLD = 3 # min_average instances of each class\n", 277 | " if sum(df.prompt.str.len()) == 0:\n", 278 | " return \"open-ended generation\"\n", 279 | "----------------------------------------------------------------------\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "res = search_functions(df, 'fine-tuning input data validation logic', n=3)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 5, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "/openai/validators.py:get_common_xfix score=0.793\n", 297 | "def get_common_xfix(series, xfix=\"suffix\"):\n", 298 | " \"\"\"\n", 299 | " Finds the longest common suffix or prefix of all the values in a series\n", 300 | " \"\"\"\n", 301 | " common_xfix = \"\"\n", 302 | " while True:\n", 303 | " common_xfixes = (\n", 304 | " series.str[-(len(common_xfix) + 1) :]\n", 305 | " if xfix == \"suffix\"\n", 306 | " else series.str[: len(common_xfix) + 1]\n", 307 | "----------------------------------------------------------------------\n", 308 | "/openai/validators.py:common_completion_suffix_validator score=0.778\n", 309 | "def common_completion_suffix_validator(df):\n", 310 | " \"\"\"\n", 311 | " This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation.\n", 312 | " \"\"\"\n", 313 | " error_msg = None\n", 314 | " immediate_msg = None\n", 315 | " optional_msg = None\n", 316 | " optional_fn = None\n", 317 | "\n", 318 | " ft_type = infer_task_type(df)\n", 319 | "----------------------------------------------------------------------\n" 320 | ] 321 | } 322 | ], 323 | "source": [ 324 | "res = search_functions(df, 'find common suffix', n=2, n_lines=10)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 6, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | 
"/openai/cli.py:tools_register score=0.773\n", 337 | "def tools_register(parser):\n", 338 | " subparsers = parser.add_subparsers(\n", 339 | " title=\"Tools\", help=\"Convenience client side tools\"\n", 340 | " )\n", 341 | "\n", 342 | " def help(args):\n", 343 | " parser.print_help()\n", 344 | "\n", 345 | " parser.set_defaults(func=help)\n", 346 | "\n", 347 | " sub = subparsers.add_parser(\"fine_tunes.prepare_data\")\n", 348 | " sub.add_argument(\n", 349 | " \"-f\",\n", 350 | " \"--file\",\n", 351 | " required=True,\n", 352 | " help=\"JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed.\"\n", 353 | " \"This should be the local file path.\",\n", 354 | " )\n", 355 | " sub.add_argument(\n", 356 | " \"-q\",\n", 357 | "----------------------------------------------------------------------\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)" 363 | ] 364 | } 365 | ], 366 | "metadata": { 367 | "interpreter": { 368 | "hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8" 369 | }, 370 | "kernelspec": { 371 | "display_name": "openai-cookbook", 372 | "language": "python", 373 | "name": "openai-cookbook" 374 | }, 375 | "language_info": { 376 | "codemirror_mode": { 377 | "name": "ipython", 378 | "version": 3 379 | }, 380 | "file_extension": ".py", 381 | "mimetype": "text/x-python", 382 | "name": "python", 383 | "nbconvert_exporter": "python", 384 | "pygments_lexer": "ipython3", 385 | "version": "3.9.6" 386 | }, 387 | "orig_nbformat": 4 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 2 391 | } 392 | -------------------------------------------------------------------------------- /examples/Embedding_long_inputs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Embedding texts that are longer than the model's maximum context length\n", 9 | "\n", 10 | "OpenAI's embedding models cannot embed text that exceeds a maximum length. The maximum length varies by model, and is measured by _tokens_, not string length. If you are unfamiliar with tokenization, check out [How to count tokens with tiktoken](How_to_count_tokens_with_tiktoken.ipynb).\n", 11 | "\n", 12 | "This notebook shows how to handle texts that are longer than a model's maximum context length. We'll demonstrate using embeddings from `text-embedding-ada-002`, but the same ideas can be applied to other models and tasks. To learn more about embeddings, check out the [OpenAI Embeddings Guide](https://beta.openai.com/docs/guides/embeddings).\n" 13 | ] 14 | }, 15 | { 16 | "attachments": {}, 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## 1. Model context length\n", 21 | "\n", 22 | "First, we select the model and define a function to get embeddings from the API." 
23 |    ]
24 |   },
25 |   {
26 |    "cell_type": "code",
27 |    "execution_count": 1,
28 |    "metadata": {},
29 |    "outputs": [],
30 |    "source": [
31 |     "import openai\n",
32 |     "from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type\n",
33 |     "\n",
34 |     "\n",
35 |     "EMBEDDING_MODEL = 'text-embedding-ada-002'\n",
36 |     "EMBEDDING_CTX_LENGTH = 8191\n",
37 |     "EMBEDDING_ENCODING = 'cl100k_base'\n",
38 |     "\n",
39 |     "# let's make sure to not retry on an invalid request, because that is what we want to demonstrate\n",
40 |     "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(openai.InvalidRequestError))\n",
41 |     "def get_embedding(text_or_tokens, model=EMBEDDING_MODEL):\n",
42 |     "    return openai.Embedding.create(input=text_or_tokens, model=model)[\"data\"][0][\"embedding\"]"
43 |    ]
44 |   },
45 |   {
46 |    "cell_type": "markdown",
47 |    "metadata": {},
48 |    "source": [
49 |     "The `text-embedding-ada-002` model has a context length of 8191 tokens with the `cl100k_base` encoding, and we can see that going over that limit causes an error."
50 |    ]
51 |   },
52 |   {
53 |    "cell_type": "code",
54 |    "execution_count": 2,
55 |    "metadata": {},
56 |    "outputs": [
57 |     {
58 |      "name": "stdout",
59 |      "output_type": "stream",
60 |      "text": [
61 |       "This model's maximum context length is 8191 tokens, however you requested 10001 tokens (10001 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.\n"
62 |      ]
63 |     }
64 |    ],
65 |    "source": [
66 |     "long_text = 'AGI ' * 5000\n",
67 |     "try:\n",
68 |     "    get_embedding(long_text)\n",
69 |     "except openai.InvalidRequestError as e:\n",
70 |     "    print(e)"
71 |    ]
72 |   },
73 |   {
74 |    "attachments": {},
75 |    "cell_type": "markdown",
76 |    "metadata": {},
77 |    "source": [
78 |     "Clearly we want to avoid these errors, particularly when programmatically handling a large number of embeddings. Yet, we still might be faced with texts that are longer than the maximum context length. Below we describe and provide recipes for the two main approaches to handling these longer texts: truncating the text to the maximum allowed length, or chunking the text and embedding each chunk individually."
79 |    ]
80 |   },
81 |   {
82 |    "attachments": {},
83 |    "cell_type": "markdown",
84 |    "metadata": {},
85 |    "source": [
86 |     "## 2. Truncating the input text\n",
87 |     "\n",
88 |     "The simplest solution is to truncate the input text to the maximum allowed length. Because the context length is measured in tokens, we have to first tokenize the text before truncating it. The API accepts inputs either as text or as tokens, so as long as you are careful that you are using the appropriate encoding, there is no need to convert the tokens back into string form. Below is an example of such a truncation function."
89 |    ]
90 |   },
91 |   {
92 |    "cell_type": "code",
93 |    "execution_count": 3,
94 |    "metadata": {},
95 |    "outputs": [],
96 |    "source": [
97 |     "import tiktoken\n",
98 |     "\n",
99 |     "def truncate_text_tokens(text, encoding_name=EMBEDDING_ENCODING, max_tokens=EMBEDDING_CTX_LENGTH):\n",
100 |     "    \"\"\"Truncate a string to have `max_tokens` according to the given encoding.\"\"\"\n",
101 |     "    encoding = tiktoken.get_encoding(encoding_name)\n",
102 |     "    return encoding.encode(text)[:max_tokens]"
103 |    ]
104 |   },
105 |   {
106 |    "attachments": {},
107 |    "cell_type": "markdown",
108 |    "metadata": {},
109 |    "source": [
110 |     "Our example from before now works without error."
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 4,
116 |    "metadata": {},
117 |    "outputs": [
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "1536"
122 |       ]
123 |      },
124 |      "execution_count": 4,
125 |      "metadata": {},
126 |      "output_type": "execute_result"
127 |     }
128 |    ],
129 |    "source": [
130 |     "truncated = truncate_text_tokens(long_text)\n",
131 |     "len(get_embedding(truncated))"
132 |    ]
133 |   },
134 |   {
135 |    "attachments": {},
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "## 3. Chunking the input text\n",
140 |     "\n",
141 |     "Though truncation works, discarding potentially relevant text is a clear drawback. Another approach is to divide the input text into chunks and then embed each chunk individually. Then, we can either use the chunk embeddings separately, or combine them in some way, such as averaging (weighted by the size of each chunk).\n",
142 |     "\n",
143 |     "We will take a function from [Python's own cookbook](https://docs.python.org/3/library/itertools.html#itertools-recipes) that breaks up a sequence into chunks."
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 5,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "from itertools import islice\n",
153 |     "\n",
154 |     "def batched(iterable, n):\n",
155 |     "    \"\"\"Batch data into tuples of length n. The last batch may be shorter.\"\"\"\n",
156 |     "    # batched('ABCDEFG', 3) --> ABC DEF G\n",
157 |     "    if n < 1:\n",
158 |     "        raise ValueError('n must be at least one')\n",
159 |     "    it = iter(iterable)\n",
160 |     "    while (batch := tuple(islice(it, n))):\n",
161 |     "        yield batch"
162 |    ]
163 |   },
164 |   {
165 |    "attachments": {},
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "Now we define a function that encodes a string into tokens and then breaks it up into chunks."
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 6,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "def chunked_tokens(text, encoding_name, chunk_length):\n",
179 |     "    encoding = tiktoken.get_encoding(encoding_name)\n",
180 |     "    tokens = encoding.encode(text)\n",
181 |     "    chunks_iterator = batched(tokens, chunk_length)\n",
182 |     "    yield from chunks_iterator"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "markdown",
187 |    "metadata": {},
188 |    "source": [
189 |     "Finally, we can write a function that safely handles embedding requests, even when the input text is longer than the maximum context length, by chunking the input tokens and embedding each chunk individually. The `average` flag can be set to `True` to return the weighted average of the chunk embeddings, or `False` to simply return the unmodified list of chunk embeddings."
190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 7, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "import numpy as np\n", 199 | "\n", 200 | "\n", 201 | "def len_safe_get_embedding(text, model=EMBEDDING_MODEL, max_tokens=EMBEDDING_CTX_LENGTH, encoding_name=EMBEDDING_ENCODING, average=True):\n", 202 | " chunk_embeddings = []\n", 203 | " for chunk in chunked_tokens(text, encoding_name=encoding_name, chunk_length=max_tokens):\n", 204 | " chunk_embeddings.append(get_embedding(chunk, model=model))\n", 205 | "\n", 206 | " if average:\n", 207 | " chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=[len(c) for c in chunk_embeddings])\n", 208 | " chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings) # normalizes length to 1\n", 209 | " chunk_embeddings = chunk_embeddings.tolist()\n", 210 | " return chunk_embeddings" 211 | ] 212 | }, 213 | { 214 | "attachments": {}, 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "Once again, we can now handle long input texts." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 8, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "Setting average=True gives us a single 1536-dimensional embedding vector for our long text.\n", 231 | "Setting average=False gives us 2 embedding vectors, one for each of the chunks.\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "average_embedding_vector = len_safe_get_embedding(long_text, average=True)\n", 237 | "chunks_embedding_vectors = len_safe_get_embedding(long_text, average=False)\n", 238 | "\n", 239 | "print(f\"Setting average=True gives us a single {len(average_embedding_vector)}-dimensional embedding vector for our long text.\")\n", 240 | "print(f\"Setting average=False gives us {len(chunks_embedding_vectors)} embedding vectors, one for each of the chunks.\")\n" 241 | ] 242 | }, 243 | { 244 | "attachments": {}, 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "In some cases, it may make sense to split chunks on paragraph boundaries or sentence boundaries to help preserve the meaning of the text." 249 | ] 250 | } 251 | ], 252 | "metadata": { 253 | "kernelspec": { 254 | "display_name": "Python 3 (ipykernel)", 255 | "language": "python", 256 | "name": "python3" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 3 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython3", 268 | "version": "3.9.9" 269 | }, 270 | "vscode": { 271 | "interpreter": { 272 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97" 273 | } 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 2 278 | } 279 | -------------------------------------------------------------------------------- /examples/Get_embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Get embeddings\n", 8 | "\n", 9 | "The function `get_embedding` will give us an embedding for an input text." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "1536" 21 | ] 22 | }, 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "import openai\n", 30 | "\n", 31 | "embedding = openai.Embedding.create(\n", 32 | " input=\"Your text goes here\", model=\"text-embedding-ada-002\"\n", 33 | ")[\"data\"][0][\"embedding\"]\n", 34 | "len(embedding)\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "1536\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "import openai\n", 52 | "from tenacity import retry, wait_random_exponential, stop_after_attempt\n", 53 | "\n", 54 | "\n", 55 | "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n", 56 | "def get_embedding(text: str, model=\"text-embedding-ada-002\") -> list[float]:\n", 57 | " return openai.Embedding.create(input=[text], model=model)[\"data\"][0][\"embedding\"]\n", 58 | "\n", 59 | "\n", 60 | "embedding = get_embedding(\"Your text goes here\", model=\"text-embedding-ada-002\")\n", 61 | "print(len(embedding))\n" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "Python 3.9.9 ('openai')", 68 | "language": "python", 69 | "name": "python3" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.9.9" 82 | }, 83 | "orig_nbformat": 4, 84 | "vscode": { 85 | "interpreter": { 86 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97" 87 | } 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } 93 | -------------------------------------------------------------------------------- /examples/How_to_count_tokens_with_tiktoken.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# How to count tokens with tiktoken\n", 9 | "\n", 10 | "[`tiktoken`](https://github.com/openai/tiktoken/blob/main/README.md) is a fast open-source tokenizer by OpenAI.\n", 11 | "\n", 12 | "Given a text string (e.g., `\"tiktoken is great!\"`) and an encoding (e.g., `\"gpt2\"`), a tokenizer can split the text string into a list of tokens (e.g., `[\"t\", \"ik\", \"token\", \" is\", \" great\", \"!\"]`).\n", 13 | "\n", 14 | "Splitting text strings into tokens is useful because models like GPT-3 see text in the form of tokens. Knowing how many tokens are in a text string can tell you (a) whether the string is too long for a text model to process and (b) how much an OpenAI API call costs (as usage is priced by token). 
Different models use different encodings.\n", 15 | "\n", 16 | "`tiktoken` supports three encodings used by OpenAI models:\n", 17 | "\n", 18 | "| Encoding name | OpenAI models |\n", 19 | "|-------------------------|-----------------------------------------------------|\n", 20 | "| `gpt2` (or `r50k_base`) | Most GPT-3 models |\n", 21 | "| `p50k_base` | Code models, `text-davinci-002`, `text-davinci-003` |\n", 22 | "| `cl100k_base` | `text-embedding-ada-002` |\n", 23 | "\n", 24 | "`p50k_base` overlaps substantially with `gpt2`, and for non-code applications, they will usually give the same tokens.\n", 25 | "\n", 26 | "## Tokenizer libraries and languages\n", 27 | "\n", 28 | "For `gpt2` encodings, tokenizers are available in many languages.\n", 29 | "- Python: [tiktoken](https://github.com/openai/tiktoken/blob/main/README.md) (or alternatively [GPT2TokenizerFast](https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2TokenizerFast))\n", 30 | "- JavaScript: [gpt-3-encoder](https://www.npmjs.com/package/gpt-3-encoder)\n", 31 | "- .NET / C#: [GPT Tokenizer](https://github.com/dluc/openai-tools)\n", 32 | "- Java: [gpt2-tokenizer-java](https://github.com/hyunwoongko/gpt2-tokenizer-java)\n", 33 | "- PHP: [GPT-3-Encoder-PHP](https://github.com/CodeRevolutionPlugins/GPT-3-Encoder-PHP)\n", 34 | "\n", 35 | "(OpenAI makes no endorsements or guarantees of third-party libraries.)\n", 36 | "\n", 37 | "For `p50k_base` and `cl100k_base` encodings, `tiktoken` is the only tokenizer available as of January 2023.\n", 38 | "- Python: [tiktoken](https://github.com/openai/tiktoken/blob/main/README.md)\n", 39 | "\n", 40 | "## How strings are typically tokenized\n", 41 | "\n", 42 | "In English, tokens commonly range in length from one character to one word (e.g., `\"t\"` or `\" great\"`), though in some languages tokens can be shorter than one character or longer than one word. Spaces are usually grouped with the starts of words (e.g., `\" is\"` instead of `\"is \"` or `\" \"`+`\"is\"`). You can quickly check how a string is tokenized at the [OpenAI Tokenizer](https://beta.openai.com/tokenizer)." 43 | ] 44 | }, 45 | { 46 | "attachments": {}, 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## 0. Install `tiktoken`\n", 51 | "\n", 52 | "In your terminal, install `tiktoken` with `pip`:\n", 53 | "\n", 54 | "```bash\n", 55 | "pip install tiktoken\n", 56 | "```" 57 | ] 58 | }, 59 | { 60 | "attachments": {}, 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## 1. Import `tiktoken`" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 1, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "import tiktoken\n" 74 | ] 75 | }, 76 | { 77 | "attachments": {}, 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## 2. Load an encoding\n", 82 | "\n", 83 | "Use `tiktoken.get_encoding()` to load an encoding by name.\n", 84 | "\n", 85 | "The first time this runs, it will require an internet connection to download. Later runs won't need an internet connection." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 2, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "encoding = tiktoken.get_encoding(\"gpt2\")\n" 95 | ] 96 | }, 97 | { 98 | "attachments": {}, 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## 3. 
Turn text into tokens with `encoding.encode()`\n", 103 | "\n" 104 | ] 105 | }, 106 | { 107 | "attachments": {}, 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "The `.encode()` method converts a text string into a list of token integers." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "[83, 1134, 30001, 318, 1049, 0]" 123 | ] 124 | }, 125 | "execution_count": 3, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "encoding.encode(\"tiktoken is great!\")\n" 132 | ] 133 | }, 134 | { 135 | "attachments": {}, 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "Count tokens by counting the length of the list returned by `.encode()`." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 4, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "def num_tokens_from_string(string: str, encoding_name: str) -> int:\n", 149 | " \"\"\"Returns the number of tokens in a text string.\"\"\"\n", 150 | " encoding = tiktoken.get_encoding(encoding_name)\n", 151 | " num_tokens = len(encoding.encode(string))\n", 152 | " return num_tokens\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 5, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "6" 164 | ] 165 | }, 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "num_tokens_from_string(\"tiktoken is great!\", \"gpt2\")\n" 173 | ] 174 | }, 175 | { 176 | "attachments": {}, 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## 4. Turn tokens into text with `encoding.decode()`" 181 | ] 182 | }, 183 | { 184 | "attachments": {}, 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "`.decode()` converts a list of token integers to a string." 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 6, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "'tiktoken is great!'" 200 | ] 201 | }, 202 | "execution_count": 6, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "encoding.decode([83, 1134, 30001, 318, 1049, 0])\n" 209 | ] 210 | }, 211 | { 212 | "attachments": {}, 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "Warning: although `.decode()` can be applied to single tokens, beware that it can be lossy for tokens that aren't on utf-8 boundaries." 217 | ] 218 | }, 219 | { 220 | "attachments": {}, 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "For single tokens, `.decode_single_token_bytes()` safely converts a single integer token to the bytes it represents." 
225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 7, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "[b't', b'ik', b'token', b' is', b' great', b'!']" 236 | ] 237 | }, 238 | "execution_count": 7, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "[encoding.decode_single_token_bytes(token) for token in [83, 1134, 30001, 318, 1049, 0]]\n" 245 | ] 246 | }, 247 | { 248 | "attachments": {}, 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "(The `b` in front of the strings indicates that the strings are byte strings.)" 253 | ] 254 | }, 255 | { 256 | "attachments": {}, 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## 5. Comparing encodings\n", 261 | "\n", 262 | "Different encodings can vary in how they split words, group spaces, and handle non-English characters. Using the methods above, we can compare different encodings on a few example strings." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 8, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "def compare_encodings(example_string: str) -> None:\n", 272 | " \"\"\"Prints a comparison of three string encodings.\"\"\"\n", 273 | " # print the example string\n", 274 | " print(f'\\nExample string: \"{example_string}\"')\n", 275 | " # for each encoding, print the # of tokens, the token integers, and the token bytes\n", 276 | " for encoding_name in [\"gpt2\", \"p50k_base\", \"cl100k_base\"]:\n", 277 | " encoding = tiktoken.get_encoding(encoding_name)\n", 278 | " token_integers = encoding.encode(example_string)\n", 279 | " num_tokens = len(token_integers)\n", 280 | " token_bytes = [encoding.decode_single_token_bytes(token) for token in token_integers]\n", 281 | " print()\n", 282 | " print(f\"{encoding_name}: {num_tokens} tokens\")\n", 283 | " print(f\"token integers: {token_integers}\")\n", 284 | " print(f\"token bytes: {token_bytes}\")\n", 285 | " " 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 9, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "\n", 298 | "Example string: \"antidisestablishmentarianism\"\n", 299 | "\n", 300 | "gpt2: 5 tokens\n", 301 | "token integers: [415, 29207, 44390, 3699, 1042]\n", 302 | "token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']\n", 303 | "\n", 304 | "p50k_base: 5 tokens\n", 305 | "token integers: [415, 29207, 44390, 3699, 1042]\n", 306 | "token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']\n", 307 | "\n", 308 | "cl100k_base: 6 tokens\n", 309 | "token integers: [519, 85342, 34500, 479, 8997, 2191]\n", 310 | "token bytes: [b'ant', b'idis', b'establish', b'ment', b'arian', b'ism']\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "compare_encodings(\"antidisestablishmentarianism\")\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 10, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "\n", 328 | "Example string: \"2 + 2 = 4\"\n", 329 | "\n", 330 | "gpt2: 5 tokens\n", 331 | "token integers: [17, 1343, 362, 796, 604]\n", 332 | "token bytes: [b'2', b' +', b' 2', b' =', b' 4']\n", 333 | "\n", 334 | "p50k_base: 5 tokens\n", 335 | "token integers: [17, 1343, 362, 796, 604]\n", 336 | "token bytes: [b'2', b' +', b' 2', b' =', b' 4']\n", 337 | "\n", 338 | 
"cl100k_base: 7 tokens\n", 339 | "token integers: [17, 489, 220, 17, 284, 220, 19]\n", 340 | "token bytes: [b'2', b' +', b' ', b'2', b' =', b' ', b'4']\n" 341 | ] 342 | } 343 | ], 344 | "source": [ 345 | "compare_encodings(\"2 + 2 = 4\")\n" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 11, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "\n", 358 | "Example string: \"お誕生日おめでとう\"\n", 359 | "\n", 360 | "gpt2: 14 tokens\n", 361 | "token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]\n", 362 | "token bytes: [b'\\xe3\\x81', b'\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97', b'\\xa5', b'\\xe3\\x81', b'\\x8a', b'\\xe3\\x82', b'\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8', b'\\xe3\\x81\\x86']\n", 363 | "\n", 364 | "p50k_base: 14 tokens\n", 365 | "token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]\n", 366 | "token bytes: [b'\\xe3\\x81', b'\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97', b'\\xa5', b'\\xe3\\x81', b'\\x8a', b'\\xe3\\x82', b'\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8', b'\\xe3\\x81\\x86']\n", 367 | "\n", 368 | "cl100k_base: 9 tokens\n", 369 | "token integers: [33334, 45918, 243, 21990, 9080, 33334, 62004, 16556, 78699]\n", 370 | "token bytes: [b'\\xe3\\x81\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97\\xa5', b'\\xe3\\x81\\x8a', b'\\xe3\\x82\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8\\xe3\\x81\\x86']\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "compare_encodings(\"お誕生日おめでとう\")\n" 376 | ] 377 | } 378 | ], 379 | "metadata": { 380 | "kernelspec": { 381 | "display_name": "openai", 382 | "language": "python", 383 | "name": "python3" 384 | }, 385 | "language_info": { 386 | "codemirror_mode": { 387 | "name": "ipython", 388 | "version": 3 389 | }, 390 | "file_extension": ".py", 391 | "mimetype": "text/x-python", 392 | "name": "python", 393 | "nbconvert_exporter": "python", 394 | "pygments_lexer": "ipython3", 395 | "version": "3.9.9" 396 | }, 397 | "orig_nbformat": 4, 398 | "vscode": { 399 | "interpreter": { 400 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97" 401 | } 402 | } 403 | }, 404 | "nbformat": 4, 405 | "nbformat_minor": 2 406 | } 407 | -------------------------------------------------------------------------------- /examples/Obtain_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 1. Load the dataset\n", 8 | "\n", 9 | "The dataset used in this example is [fine-food reviews](https://www.kaggle.com/snap/amazon-fine-food-reviews) from Amazon. The dataset contains a total of 568,454 food reviews Amazon users left up to October 2012. We will use a subset of this dataset, consisting of 1,000 most recent reviews for illustration purposes. The reviews are in English and tend to be positive or negative. Each review has a ProductId, UserId, Score, review title (Summary) and review body (Text).\n", 10 | "\n", 11 | "We will combine the review summary and review text into a single combined text. The model will encode this combined text and it will output a single vector embedding." 
12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "To run this notebook, you will need to install: pandas, openai, transformers, plotly, matplotlib, scikit-learn, torch (a dependency of transformers), torchvision, and scipy." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 6, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# imports\n", 29 | "import pandas as pd\n", 30 | "import tiktoken\n", 31 | "\n", 32 | "from openai.embeddings_utils import get_embedding\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 7, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# embedding model parameters\n", 42 | "embedding_model = \"text-embedding-ada-002\"\n", 43 | "embedding_encoding = \"cl100k_base\" # this is the encoding for text-embedding-ada-002\n", 44 | "max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 8, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/html": [ 55 | "
\n", 56 | "\n", 69 | "\n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | "
TimeProductIdUserIdScoreSummaryTextcombined
01351123200B003XPF9BOA3R7JR3FMEBXQB5where does one start...and stop... with a tre...Wanted to save some to bring to my Chicago fam...Title: where does one start...and stop... wit...
11351123200B003JK537SA3JBPC3WFUT5ZP1Arrived in piecesNot pleased at all. When I opened the box, mos...Title: Arrived in pieces; Content: Not pleased...
\n", 105 | "
" 106 | ], 107 | "text/plain": [ 108 | " Time ProductId UserId Score \\\n", 109 | "0 1351123200 B003XPF9BO A3R7JR3FMEBXQB 5 \n", 110 | "1 1351123200 B003JK537S A3JBPC3WFUT5ZP 1 \n", 111 | "\n", 112 | " Summary \\\n", 113 | "0 where does one start...and stop... with a tre... \n", 114 | "1 Arrived in pieces \n", 115 | "\n", 116 | " Text \\\n", 117 | "0 Wanted to save some to bring to my Chicago fam... \n", 118 | "1 Not pleased at all. When I opened the box, mos... \n", 119 | "\n", 120 | " combined \n", 121 | "0 Title: where does one start...and stop... wit... \n", 122 | "1 Title: Arrived in pieces; Content: Not pleased... " 123 | ] 124 | }, 125 | "execution_count": 8, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "# load & inspect dataset\n", 132 | "input_datapath = \"data/fine_food_reviews_1k.csv\" # to save space, we provide a pre-filtered dataset\n", 133 | "df = pd.read_csv(input_datapath, index_col=0)\n", 134 | "df = df[[\"Time\", \"ProductId\", \"UserId\", \"Score\", \"Summary\", \"Text\"]]\n", 135 | "df = df.dropna()\n", 136 | "df[\"combined\"] = (\n", 137 | " \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()\n", 138 | ")\n", 139 | "df.head(2)\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 9, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "1000" 151 | ] 152 | }, 153 | "execution_count": 9, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "# subsample to 1k most recent reviews and remove samples that are too long\n", 160 | "top_n = 1000\n", 161 | "df = df.sort_values(\"Time\").tail(top_n * 2) # first cut to first 2k entries, assuming less than half will be filtered out\n", 162 | "df.drop(\"Time\", axis=1, inplace=True)\n", 163 | "\n", 164 | "encoding = tiktoken.get_encoding(embedding_encoding)\n", 165 | "\n", 166 | "# omit reviews that are too long to embed\n", 167 | "df[\"n_tokens\"] = df.combined.apply(lambda x: len(encoding.encode(x)))\n", 168 | "df = df[df.n_tokens <= max_tokens].tail(top_n)\n", 169 | "len(df)\n" 170 | ] 171 | }, 172 | { 173 | "attachments": {}, 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## 2. 
Get embeddings and save them for future reuse" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 10, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage\n", 187 | "\n", 188 | "# This may take a few minutes\n", 189 | "df[\"embedding\"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))\n", 190 | "df.to_csv(\"data/fine_food_reviews_with_embeddings_1k.csv\")\n" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "openai", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.9.9 (main, Dec 7 2021, 18:04:56) \n[Clang 13.0.0 (clang-1300.0.29.3)]" 211 | }, 212 | "orig_nbformat": 4, 213 | "vscode": { 214 | "interpreter": { 215 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97" 216 | } 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /examples/Regression_using_embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Regression using the embeddings\n", 8 | "\n", 9 | "Regression means predicting a number, rather than one of the categories. We will predict the score based on the embedding of the review's text. We split the dataset into a training and a testing set for all of the following tasks, so we can realistically evaluate performance on unseen data. The dataset is created in the [Obtain_dataset Notebook](Obtain_dataset.ipynb).\n", 10 | "\n", 11 | "We're predicting the score of the review, which is a number between 1 and 5 (1-star being negative and 5-star positive)." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "ada-002 embedding performance on 1k Amazon reviews: mse=0.62, mae=0.53\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "\n", 31 | "from sklearn.ensemble import RandomForestRegressor\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", 34 | "\n", 35 | "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", 36 | "\n", 37 | "df = pd.read_csv(datafile_path)\n", 38 | "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n", 39 | "\n", 40 | "X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), df.Score, test_size=0.2, random_state=42)\n", 41 | "\n", 42 | "rfr = RandomForestRegressor(n_estimators=100)\n", 43 | "rfr.fit(X_train, y_train)\n", 44 | "preds = rfr.predict(X_test)\n", 45 | "\n", 46 | "mse = mean_squared_error(y_test, preds)\n", 47 | "mae = mean_absolute_error(y_test, preds)\n", 48 | "\n", 49 | "print(f\"ada-002 embedding performance on 1k Amazon reviews: mse={mse:.2f}, mae={mae:.2f}\")\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "Dummy mean prediction performance on Amazon reviews: mse=1.73, mae=1.03\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "bmse = mean_squared_error(y_test, np.repeat(y_test.mean(), len(y_test)))\n", 67 | "bmae = mean_absolute_error(y_test, np.repeat(y_test.mean(), len(y_test)))\n", 68 | "print(\n", 69 | " f\"Dummy mean prediction performance on Amazon reviews: mse={bmse:.2f}, mae={bmae:.2f}\"\n", 70 | ")\n" 71 | ] 72 | }, 73 | { 74 | "attachments": {}, 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "We can see that the embeddings are able to predict the scores with an average error of 0.53 per score prediction. This is roughly equivalent to predicting half of reviews perfectly, and half off by one star." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "You could also train a classifier to predict the label, or use the embeddings within an existing ML model to encode free text features." 
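,
    "\n",
    "As a minimal sketch of the classification variant (assuming the same `X_train`, `X_test`, `y_train`, `y_test` split created above):\n",
    "\n",
    "```python\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "clf = RandomForestClassifier(n_estimators=100)\n",
    "clf.fit(X_train, y_train)  # treats each star rating (1-5) as a class label\n",
    "print(f\"accuracy: {accuracy_score(y_test, clf.predict(X_test)):.2f}\")\n",
    "```\n",
    "\n",
    "A full walkthrough of that approach is in the [Classification_using_embeddings notebook](Classification_using_embeddings.ipynb)."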
86 | ] 87 | } 88 | ], 89 | "metadata": { 90 | "kernelspec": { 91 | "display_name": "openai", 92 | "language": "python", 93 | "name": "python3" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": { 97 | "name": "ipython", 98 | "version": 3 99 | }, 100 | "file_extension": ".py", 101 | "mimetype": "text/x-python", 102 | "name": "python", 103 | "nbconvert_exporter": "python", 104 | "pygments_lexer": "ipython3", 105 | "version": "3.9.9" 106 | }, 107 | "orig_nbformat": 4, 108 | "vscode": { 109 | "interpreter": { 110 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97" 111 | } 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /examples/Semantic_text_search_using_embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Semantic text search using embeddings\n", 8 | "\n", 9 | "We can search through all our reviews semantically in a very efficient manner and at very low cost, by simply embedding our search query, and then finding the most similar reviews. The dataset is created in the [Obtain_dataset Notebook](Obtain_dataset.ipynb)." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "\n", 21 | "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", 22 | "\n", 23 | "df = pd.read_csv(datafile_path)\n", 24 | "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "Remember to use the documents embedding engine for documents (in this case reviews), and query embedding engine for queries. Note that here we just compare the cosine similarity of the embeddings of the query and the documents, and show top_n best matches." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Good Buy: I liked the beans. They were vacuum sealed, plump and moist. Would recommend them for any use. I personally split and stuck them in some vodka to make vanilla extract. Yum!\n", 44 | "\n", 45 | "Jamaican Blue beans: Excellent coffee bean for roasting. Our family just purchased another 5 pounds for more roasting. 
Plenty of flavor and mild on acidity when roasted to a dark brown bean and befor\n", 46 | "\n", 47 | "Delicious!: I enjoy this white beans seasoning, it gives a rich flavor to the beans I just love it, my mother in law didn't know about this Zatarain's brand and now she is traying different seasoning\n", 48 | "\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "from openai.embeddings_utils import get_embedding, cosine_similarity\n", 54 | "\n", 55 | "# search through the reviews for a specific product\n", 56 | "def search_reviews(df, product_description, n=3, pprint=True):\n", 57 | "    product_embedding = get_embedding(\n", 58 | "        product_description,\n", 59 | "        engine=\"text-embedding-ada-002\"\n", 60 | "    )\n", 61 | "    df[\"similarity\"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))\n", 62 | "\n", 63 | "    results = (\n", 64 | "        df.sort_values(\"similarity\", ascending=False)\n", 65 | "        .head(n)\n", 66 | "        .combined.str.replace(\"Title: \", \"\")\n", 67 | "        .str.replace(\"; Content:\", \": \")\n", 68 | "    )\n", 69 | "    if pprint:\n", 70 | "        for r in results:\n", 71 | "            print(r[:200])\n", 72 | "            print()\n", 73 | "    return results\n", 74 | "\n", 75 | "\n", 76 | "results = search_reviews(df, \"delicious beans\", n=3)\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "Tasty and Quick Pasta: Barilla Whole Grain Fusilli with Vegetable Marinara is tasty and has an excellent chunky vegetable marinara. I just wish there was more of it. If you aren't starving or on a \n", 89 | "\n", 90 | "sooo good: tastes so good. Worth the money. My boyfriend hates wheat pasta and LOVES this. cooks fast tastes great.I love this brand and started buying more of their pastas. Bulk is best.\n", 91 | "\n", 92 | "Handy: Love the idea of ready in a minute pasta and for that alone this product gets praise. The pasta is whole grain so that's a big plus and it actually comes out al dente. The vegetable marinara\n", 93 | "\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "results = search_reviews(df, \"whole wheat pasta\", n=3)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "We can search through these reviews easily. To speed up computation over larger datasets, we can use an approximate nearest-neighbor index, which trades a little accuracy for much faster search through the embeddings." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "great product, poor delivery: The coffee is excellent and I am a repeat buyer. Problem this time was with the UPS delivery. They left the box in front of my garage door in the middle of the drivewa\n", 118 | "\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "results = search_reviews(df, \"bad delivery\", n=1)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "As we can see, this can immediately deliver a lot of value. In this example, we quickly surface examples of delivery failures." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "Extremely dissapointed: Hi,
I am very disappointed with the past shipment I received of the ONE coconut water. 3 of the boxes were leaking and the coconut water was spoiled.

Thanks." 117 | ] 118 | }, 119 | "metadata": { 120 | "needs_background": "light" 121 | }, 122 | "output_type": "display_data" 123 | } 124 | ], 125 | "source": [ 126 | "import matplotlib.pyplot as plt\n", 127 | "import statsmodels.api as sm\n", 128 | "\n", 129 | "\n", 130 | "correlation = X_test[['percentile_cosine_similarity', 'Score']].corr().values[0,1]\n", 131 | "print('Correlation between user & vector similarity percentile metric and review number of stars (score): %.2f%%' % (100*correlation))\n", 132 | "\n", 133 | "# boxplot of cosine similarity for each score\n", 134 | "X_test.boxplot(column='percentile_cosine_similarity', by='Score')\n", 135 | "plt.title('')\n", 136 | "plt.show()\n", 137 | "plt.close()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "We can observe a weak trend, showing that the higher the similarity score between the user and the product embedding, the higher the review score. Therefore, the user and product embeddings can weakly predict the review score - even before the user receives the product!\n", 145 | "\n", 146 | "Because this signal works in a different way than the more commonly used collaborative filtering, it can act as an additional feature to slightly improve the performance on existing problems." 147 | ] 148 | } 149 | ], 150 | "metadata": { 151 | "interpreter": { 152 | "hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8" 153 | }, 154 | "kernelspec": { 155 | "display_name": "Python 3.7.3 64-bit ('base': conda)", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.7.3" 169 | }, 170 | "orig_nbformat": 4 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 2 174 | } 175 | -------------------------------------------------------------------------------- /examples/azure/completions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Azure completions example\n", 8 | "In this example we'll try to go over all operations needed to get completions working using the Azure endpoints. \\\n", 9 | "This example focuses on completions but also touches on some other operations that are also available using the API. This example is meant to be a quick way of showing simple operations and is not meant as a tutorial." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import openai\n", 19 | "from openai import cli" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Setup\n", 27 | "For the following sections to work properly we first have to setup some things. Let's start with the `api_base` and `api_version`. To find your `api_base` go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for the \"Endpoint\" value." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "openai.api_version = '2022-12-01'\n", 37 | "openai.api_base = '' # Please add your endpoint here" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "We next have to setup the `api_type` and `api_key`. We can either get the key from the portal or we can get it through Microsoft Active Directory Authentication. Depending on this the `api_type` is either `azure` or `azure_ad`." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Setup: Portal\n", 52 | "Let's first look at getting the key from the portal. Go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for one of the \"Keys\" values." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "openai.api_type = 'azure'\n", 62 | "openai.api_key = '' # Please add your api key here" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "### (Optional) Setup: Microsoft Active Directory Authentication\n", 70 | "Let's now see how we can get a key via Microsoft Active Directory Authentication. Uncomment the following code if you want to use Active Directory Authentication instead of keys from the portal." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# from azure.identity import DefaultAzureCredential\n", 80 | "\n", 81 | "# default_credential = DefaultAzureCredential()\n", 82 | "# token = default_credential.get_token(\"https://cognitiveservices.azure.com/.default\")\n", 83 | "\n", 84 | "# openai.api_type = 'azure_ad'\n", 85 | "# openai.api_key = token.token" 86 | ] 87 | }, 88 | { 89 | "attachments": {}, 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Deployments\n", 94 | "In this section we are going to create a deployment using the `text-davinci-002` model that we can then use to create completions." 95 | ] 96 | }, 97 | { 98 | "attachments": {}, 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Deployments: Create manually\n", 103 | "Create a new deployment by going to your Resource in your portal under \"Resource Management\" -> \"Model deployments\". Select `text-davinci-002` as the model." 
104 | ] 105 | }, 106 | { 107 | "attachments": {}, 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### (Optional) Deployments: Create programmatically\n", 112 | "We can also create a deployment using code:" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "model = \"text-davinci-002\"\n", 122 | "\n", 123 | "# Now let's create the deployment\n", 124 | "print(f'Creating a new deployment with model: {model}')\n", 125 | "result = openai.Deployment.create(model=model, scale_settings={\"scale_type\":\"standard\"})\n", 126 | "deployment_id = result[\"id\"]\n", 127 | "print(f'Successfully created deployment with id: {deployment_id}')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### (Optional) Deployments: Wait for deployment to succeed\n", 135 | "Now let's check the status of the newly created deployment and wait until it has succeeded." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "print(f'Checking for deployment status.')\n", 145 | "resp = openai.Deployment.retrieve(id=deployment_id)\n", 146 | "status = resp[\"status\"]\n", 147 | "print(f'Deployment {deployment_id} has status: {status}')\n", 148 | "while status not in [\"succeeded\", \"failed\"]:\n", 149 | "    resp = openai.Deployment.retrieve(id=deployment_id)\n", 150 | "    status = resp[\"status\"]\n", 151 | "    print(f'Deployment {deployment_id} has status: {status}')" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### Completions\n", 159 | "Now let's send a sample completion to the deployment."
160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "prompt = \"The food was delicious and the waiter\"\n", 169 | "completion = openai.Completion.create(deployment_id=deployment_id,\n", 170 | " prompt=prompt, stop=\".\", temperature=0)\n", 171 | " \n", 172 | "print(f\"{prompt}{completion['choices'][0]['text']}.\")" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### (Optional) Deployments: Delete\n", 180 | "Finally let's delete the deployment" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "print(f'Deleting deployment: {deployment_id}')\n", 190 | "openai.Deployment.delete(sid=deployment_id)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [] 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": "Python 3 (ipykernel)", 204 | "language": "python", 205 | "name": "python3" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 3 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython3", 217 | "version": "3.10.8" 218 | }, 219 | "vscode": { 220 | "interpreter": { 221 | "hash": "3a5103089ab7e7c666b279eeded403fcec76de49a40685dbdfe9f9c78ad97c17" 222 | } 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 2 227 | } 228 | -------------------------------------------------------------------------------- /examples/azure/embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Azure embeddings example\n", 9 | "In this example we'll try to go over all operations for embeddings that can be done using the Azure endpoints. \\\n", 10 | "This example focuses on embeddings but also touches some other operations that are also available using the API. This example is meant to be a quick way of showing simple operations and is not meant as a tutorial." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import openai\n", 20 | "from openai import cli" 21 | ] 22 | }, 23 | { 24 | "attachments": {}, 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Setup\n", 29 | "For the following sections to work properly we first have to setup some things. Let's start with the `api_base` and `api_version`. To find your `api_base` go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for the \"Endpoint\" value." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "openai.api_version = '2022-12-01'\n", 39 | "openai.api_base = '' # Please add your endpoint here" 40 | ] 41 | }, 42 | { 43 | "attachments": {}, 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "We next have to setup the `api_type` and `api_key`. We can either get the key from the portal or we can get it through Microsoft Active Directory Authentication. 
Depending on this the `api_type` is either `azure` or `azure_ad`." 48 | ] 49 | }, 50 | { 51 | "attachments": {}, 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Setup: Portal\n", 56 | "Let's first look at getting the key from the portal. Go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for one of the \"Keys\" values." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "openai.api_type = 'azure'\n", 66 | "openai.api_key = '' # Please add your api key here" 67 | ] 68 | }, 69 | { 70 | "attachments": {}, 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### (Optional) Setup: Microsoft Active Directory Authentication\n", 75 | "Let's now see how we can get a key via Microsoft Active Directory Authentication. Uncomment the following code if you want to use Active Directory Authentication instead of keys from the portal." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# from azure.identity import DefaultAzureCredential\n", 85 | "\n", 86 | "# default_credential = DefaultAzureCredential()\n", 87 | "# token = default_credential.get_token(\"https://cognitiveservices.azure.com/.default\")\n", 88 | "\n", 89 | "# openai.api_type = 'azure_ad'\n", 90 | "# openai.api_key = token.token" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Deployments\n", 98 | "In this section we are going to create a deployment that we can use to create embeddings." 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "### Deployments: Create manually\n", 106 | "Let's create a deployment using the `text-similarity-curie-001` model. Create a new deployment by going to your Resource in your portal under \"Resource Management\" -> \"Model deployments\"." 
107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### (Optional) Deployments: Create programmatically\n", 114 | "We can also create a deployment using code:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "model = \"text-similarity-curie-001\"\n", 124 | "\n", 125 | "# Now let's create the deployment\n", 126 | "print(f'Creating a new deployment with model: {model}')\n", 127 | "result = openai.Deployment.create(model=model, scale_settings={\"scale_type\":\"standard\"})\n", 128 | "deployment_id = result[\"id\"]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### (Optional) Deployments: Retrieving\n", 136 | "Now let's check the status of the newly created deployment." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "print(f'Checking for deployment status.')\n", 146 | "resp = openai.Deployment.retrieve(id=deployment_id)\n", 147 | "status = resp[\"status\"]\n", 148 | "print(f'Deployment {deployment_id} has status: {status}')" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Deployments: Listing\n", 156 | "Now because creating a new deployment takes a long time, let's look in the subscription for an already finished deployment that succeeded." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "print('While the deployment is running, selecting a completed one that supports embeddings.')\n", 166 | "deployment_id = None\n", 167 | "result = openai.Deployment.list()\n", 168 | "for deployment in result.data:\n", 169 | "    if deployment[\"status\"] != \"succeeded\":\n", 170 | "        continue\n", 171 | "    \n", 172 | "    model = openai.Model.retrieve(deployment[\"model\"])\n", 173 | "    if model[\"capabilities\"][\"embeddings\"] != True:\n", 174 | "        continue\n", 175 | "    \n", 176 | "    deployment_id = deployment[\"id\"]\n", 177 | "    break\n", 178 | "\n", 179 | "if not deployment_id:\n", 180 | "    print('No deployment with status: succeeded found.')\n", 181 | "else:\n", 182 | "    print(f'Found a successful deployment that supports embeddings with id: {deployment_id}.')" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### Embeddings\n", 190 | "Now let's send a sample embedding to the deployment."
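,
    "\n",
    "The response carries the vector under `data[0].embedding`; a quick sketch of pulling it out after running the next cell:\n",
    "\n",
    "```python\n",
    "vector = embeddings[\"data\"][0][\"embedding\"]  # a list of floats\n",
    "print(len(vector))\n",
    "```"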
191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "embeddings = openai.Embedding.create(deployment_id=deployment_id,\n", 200 | "    input=\"The food was delicious and the waiter...\")\n", 201 | "    \n", 202 | "print(embeddings)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "### (Optional) Deployments: Delete\n", 210 | "Finally, let's delete the deployment." 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "print(f'Deleting deployment: {deployment_id}')\n", 220 | "openai.Deployment.delete(sid=deployment_id)" 221 | ] 222 | } 223 | ], 224 | "metadata": { 225 | "kernelspec": { 226 | "display_name": "Python 3", 227 | "language": "python", 228 | "name": "python3" 229 | }, 230 | "language_info": { 231 | "codemirror_mode": { 232 | "name": "ipython", 233 | "version": 3 234 | }, 235 | "file_extension": ".py", 236 | "mimetype": "text/x-python", 237 | "name": "python", 238 | "nbconvert_exporter": "python", 239 | "pygments_lexer": "ipython3", 240 | "version": "3.10.8" 241 | }, 242 | "vscode": { 243 | "interpreter": { 244 | "hash": "3a5103089ab7e7c666b279eeded403fcec76de49a40685dbdfe9f9c78ad97c17" 245 | } 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 2 250 | } 251 | -------------------------------------------------------------------------------- /examples/azure/finetuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Azure fine-tuning example\n", 8 | "In this example we'll try to go over all operations that can be done using the Azure endpoints and their differences from the OpenAI endpoints (if any).
\n", 9 | "This example focuses on finetuning but also touches on the majority of operations that are available using the API. This example is meant to be a quick way of showing simple operations and is not meant as a finetune model adaptation tutorial.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import openai\n", 19 | "from openai import cli" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Setup\n", 27 | "For the following sections to work properly we first have to setup some things. Let's start with the `api_base` and `api_version`. To find your `api_base` go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for the \"Endpoint\" value." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "openai.api_version = '2022-12-01'\n", 37 | "openai.api_base = '' # Please add your endpoint here" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "We next have to setup the `api_type` and `api_key`. We can either get the key from the portal or we can get it through Microsoft Active Directory Authentication. Depending on this the `api_type` is either `azure` or `azure_ad`." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Setup: Portal\n", 52 | "Let's first look at getting the key from the portal. Go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for one of the \"Keys\" values." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "openai.api_type = 'azure'\n", 62 | "openai.api_key = '' # Please add your api key here" 63 | ] 64 | }, 65 | { 66 | "attachments": {}, 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### (Optional) Setup: Microsoft Active Directory Authentication\n", 71 | "Let's now see how we can get a key via Microsoft Active Directory Authentication. Uncomment the following code if you want to use Active Directory Authentication instead of keys from the portal." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# from azure.identity import DefaultAzureCredential\n", 81 | "\n", 82 | "# default_credential = DefaultAzureCredential()\n", 83 | "# token = default_credential.get_token(\"https://cognitiveservices.azure.com/.default\")\n", 84 | "\n", 85 | "# openai.api_type = 'azure_ad'\n", 86 | "# openai.api_key = token.token" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Files\n", 94 | "In the next section we will focus on the files operations: importing, listing, retrieving, deleting. For this we need to create 2 temporary files with some sample data. For the sake of simplicity, we will use the same data for training and validation." 
95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "import shutil\n", 104 | "import json\n", 105 | "\n", 106 | "training_file_name = 'training.jsonl'\n", 107 | "validation_file_name = 'validation.jsonl'\n", 108 | "\n", 109 | "sample_data = [{\"prompt\": \"When I go to the store, I want an\", \"completion\": \"apple.\"},\n", 110 | " {\"prompt\": \"When I go to work, I want a\", \"completion\": \"coffee.\"},\n", 111 | " {\"prompt\": \"When I go home, I want a\", \"completion\": \"soda.\"}]\n", 112 | "\n", 113 | "print(f'Generating the training file: {training_file_name}')\n", 114 | "with open(training_file_name, 'w') as training_file:\n", 115 | " for entry in sample_data:\n", 116 | " json.dump(entry, training_file)\n", 117 | " training_file.write('\\n')\n", 118 | "\n", 119 | "print(f'Copying the training file to the validation file')\n", 120 | "shutil.copy(training_file_name, validation_file_name)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Files: Listing\n", 128 | "List all of the uploaded files and check for the ones that are named \"training.jsonl\" or \"validation.jsonl\"" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "print('Checking for existing uploaded files.')\n", 138 | "results = []\n", 139 | "files = openai.File.list().data\n", 140 | "print(f'Found {len(files)} total uploaded files in the subscription.')\n", 141 | "for item in files:\n", 142 | " if item[\"filename\"] in [training_file_name, validation_file_name]:\n", 143 | " results.append(item[\"id\"])\n", 144 | "print(f'Found {len(results)} already uploaded files that match our names.')\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Files: Deleting\n", 152 | "Let's now delete those found files (if any) since we're going to be re-uploading them next." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "print(f'Deleting already uploaded files...')\n", 162 | "for id in results:\n", 163 | " openai.File.delete(sid = id)\n" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### Files: Importing & Retrieving\n", 171 | "Now, let's import our two files ('training.jsonl' and 'validation.jsonl') and keep those IDs since we're going to use them later for finetuning.
\n", 172 | "For this operation we are going to use the cli wrapper which does a bit more checks before uploading and also gives us progress. In addition, after uploading we're going to check the status our import until it has succeeded (or failed if something goes wrong)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "import time\n", 182 | "\n", 183 | "def check_status(training_id, validation_id):\n", 184 | " train_status = openai.File.retrieve(training_id)[\"status\"]\n", 185 | " valid_status = openai.File.retrieve(validation_id)[\"status\"]\n", 186 | " print(f'Status (training_file | validation_file): {train_status} | {valid_status}')\n", 187 | " return (train_status, valid_status)\n", 188 | "\n", 189 | "#importing our two files\n", 190 | "training_id = cli.FineTune._get_or_upload(training_file_name, True)\n", 191 | "validation_id = cli.FineTune._get_or_upload(validation_file_name, True)\n", 192 | "\n", 193 | "#checking the status of the imports\n", 194 | "(train_status, valid_status) = check_status(training_id, validation_id)\n", 195 | "\n", 196 | "while train_status not in [\"succeeded\", \"failed\"] or valid_status not in [\"succeeded\", \"failed\"]:\n", 197 | " time.sleep(1)\n", 198 | " (train_status, valid_status) = check_status(training_id, validation_id)\n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "### Files: Downloading\n", 206 | "Now let's download one of the files, the training file for example, to check that everything was in order during importing and all bits are there." 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "print(f'Downloading training file: {training_id}')\n", 216 | "result = openai.File.download(training_id)\n", 217 | "print(result.decode('utf-8'))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "## Finetune\n", 225 | "In this section we are going to use the two training and validation files that we imported in the previous section, to train a finetune model." 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "### Finetune: Adapt\n", 233 | "First let's create the finetune adaptation job." 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "create_args = {\n", 243 | " \"training_file\": training_id,\n", 244 | " \"validation_file\": validation_id,\n", 245 | " \"model\": \"babbage\",\n", 246 | " \"compute_classification_metrics\": True,\n", 247 | " \"classification_n_classes\": 3,\n", 248 | " \"n_epochs\": 20,\n", 249 | " \"batch_size\": 3,\n", 250 | " \"learning_rate_multiplier\": 0.3\n", 251 | "}\n", 252 | "resp = openai.FineTune.create(**create_args)\n", 253 | "job_id = resp[\"id\"]\n", 254 | "status = resp[\"status\"]\n", 255 | "\n", 256 | "print(f'Fine-tunning model with jobID: {job_id}.')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "### Finetune: Streaming\n", 264 | "While the job runs, we can subscribe to the streaming events to check the progress of the operation." 
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "import signal\n", 274 | "import datetime\n", 275 | "\n", 276 | "def signal_handler(sig, frame):\n", 277 | " status = openai.FineTune.retrieve(job_id).status\n", 278 | " print(f\"Stream interrupted. Job is still {status}.\")\n", 279 | " return\n", 280 | "\n", 281 | "print(f'Streaming events for the fine-tuning job: {job_id}')\n", 282 | "signal.signal(signal.SIGINT, signal_handler)\n", 283 | "\n", 284 | "events = openai.FineTune.stream_events(job_id)\n", 285 | "try:\n", 286 | " for event in events:\n", 287 | " print(f'{datetime.datetime.fromtimestamp(event[\"created_at\"])} {event[\"message\"]}')\n", 288 | "\n", 289 | "except Exception:\n", 290 | " print(\"Stream interrupted (client disconnected).\")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "### Finetune: Listing and Retrieving\n", 298 | "Now let's check that our operation was successful and in addition we can look at all of the finetuning operations using a list operation." 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "status = openai.FineTune.retrieve(id=job_id)[\"status\"]\n", 308 | "if status not in [\"succeeded\", \"failed\"]:\n", 309 | " print(f'Job not in terminal status: {status}. Waiting.')\n", 310 | " while status not in [\"succeeded\", \"failed\"]:\n", 311 | " time.sleep(2)\n", 312 | " status = openai.FineTune.retrieve(id=job_id)[\"status\"]\n", 313 | " print(f'Status: {status}')\n", 314 | "else:\n", 315 | " print(f'Finetune job {job_id} finished with status: {status}')\n", 316 | "\n", 317 | "print('Checking other finetune jobs in the subscription.')\n", 318 | "result = openai.FineTune.list()\n", 319 | "print(f'Found {len(result.data)} finetune jobs.')" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "### Finetune: Deleting\n", 327 | "Finally we can delete our finetune job.
\n", 328 | "WARNING: Please skip this step if you want to continue with the next section as the finetune model is needed. (The delete code is commented out by default)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "# openai.FineTune.delete(sid=job_id)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "## Deployments\n", 345 | "In this section we are going to create a deployment using the finetune model that we just adapted and then used the deployment to create a simple completion operation." 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "### Deployments: Create\n", 353 | "Let's create a deployment using the fine-tune model." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "#Fist let's get the model of the previous job:\n", 363 | "result = openai.FineTune.retrieve(id=job_id)\n", 364 | "if result[\"status\"] == 'succeeded':\n", 365 | " model = result[\"fine_tuned_model\"]\n", 366 | "\n", 367 | "# Now let's create the deployment\n", 368 | "print(f'Creating a new deployment with model: {model}')\n", 369 | "result = openai.Deployment.create(model=model, scale_settings={\"scale_type\":\"standard\"})\n", 370 | "deployment_id = result[\"id\"]\n" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "### Deployments: Retrieving\n", 378 | "Now let's check the status of the newly created deployment" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "print(f'Checking for deployment status.')\n", 388 | "resp = openai.Deployment.retrieve(id=deployment_id)\n", 389 | "status = resp[\"status\"]\n", 390 | "print(f'Deployment {deployment_id} is with status: {status}')\n" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "### Deployments: Listing\n", 398 | "Now because creating a new deployment takes a long time, let's look in the subscription for an already finished deployment that succeeded." 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "print('While deployment running, selecting a completed one.')\n", 408 | "deployment_id = None\n", 409 | "result = openai.Deployment.list()\n", 410 | "for deployment in result.data:\n", 411 | " if deployment[\"status\"] == \"succeeded\":\n", 412 | " deployment_id = deployment[\"id\"]\n", 413 | " break\n", 414 | "\n", 415 | "if not deployment_id:\n", 416 | " print('No deployment with status: succeeded found.')\n", 417 | "else:\n", 418 | " print(f'Found a successful deployment with id: {deployment_id}.')\n" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "### Completions\n", 426 | "Now let's send a sample completion to the deployment." 
427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "print('Sending a test completion job')\n", 436 | "start_phrase = 'When I go home, I want a'\n", 437 | "response = openai.Completion.create(deployment_id=deployment_id, prompt=start_phrase, temperature=0, stop=\".\")\n", 438 | "text = response['choices'][0]['text'].replace('\\n', '').replace(' .', '.').strip()\n", 439 | "print(f'\"{start_phrase} {text}.\"')" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "### Deployments: Delete\n", 447 | "Finally let's delete the deployment" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "print(f'Deleting deployment: {deployment_id}')\n", 457 | "openai.Deployment.delete(sid=deployment_id)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "Thank you" 465 | ] 466 | } 467 | ], 468 | "metadata": { 469 | "kernelspec": { 470 | "display_name": "Python 3", 471 | "language": "python", 472 | "name": "python3" 473 | }, 474 | "language_info": { 475 | "codemirror_mode": { 476 | "name": "ipython", 477 | "version": 3 478 | }, 479 | "file_extension": ".py", 480 | "mimetype": "text/x-python", 481 | "name": "python", 482 | "nbconvert_exporter": "python", 483 | "pygments_lexer": "ipython3", 484 | "version": "3.10.8" 485 | }, 486 | "vscode": { 487 | "interpreter": { 488 | "hash": "3a5103089ab7e7c666b279eeded403fcec76de49a40685dbdfe9f9c78ad97c17" 489 | } 490 | } 491 | }, 492 | "nbformat": 4, 493 | "nbformat_minor": 2 494 | } 495 | -------------------------------------------------------------------------------- /examples/data/25000_spend_dataset_current.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/examples/data/25000_spend_dataset_current.csv -------------------------------------------------------------------------------- /examples/data/labelled_transactions.csv: -------------------------------------------------------------------------------- 1 | Date,Supplier,Description,Transaction value (£),Classification 2 | 15/08/2016,Creative Video Productions Ltd,Kelvin Hall,26866,Other 3 | 29/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,74806,Building Improvement 4 | 29/05/2017,Morris & Spottiswood Ltd,George IV Bridge Work,56448,Building Improvement 5 | 31/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,164691,Building Improvement 6 | 24/07/2017,John Graham Construction Ltd,Causewayside Refurbishment,27926,Building Improvement 7 | 24/07/2017,John Graham Construction Ltd,Causewayside Refurbishment,212690,Building Improvement 8 | 16/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,59021,Building Improvement 9 | 16/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,136379,Building Improvement 10 | 23/08/2017,Culture And Sport Glasgow,Kelvin Hall,60503,Building Improvement 11 | 23/08/2017,XMA Scotland Ltd,Kelvin Hall,31830,Building Improvement 12 | 31/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,36313,Building Improvement 13 | 31/08/2017,Insight Direct (UK) Ltd,Causewayside Refurbishment,68222,Building Improvement 14 | 31/08/2017,Mark Finn Laboratory,George IV Bridge Work,53884,Building Improvement 15 | 
11/09/2017,John Graham Construction Ltd,Causewayside Refurbishment,189483,Building Improvement 16 | 23/10/2017,John Graham Construction Ltd,Causewayside Refurbishment,151659,Building Improvement 17 | 23/10/2017,City Building LLP,Causewayside Refurbishment,53147,Building Improvement 18 | 07/02/2017,John Graham Construction Ltd,Causewayside Refurbishment,52404,Building Improvement 19 | 13/02/2017,John Graham Construction Ltd,Causewayside Refurbishment,272390,Building Improvement 20 | 06/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,31781,Building Improvement 21 | 06/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,198048,Building Improvement 22 | 31/03/2017,Nicholson Bros(Electrical Contractors) Ltd,Causewayside Refurbishment,33666,Building Improvement 23 | 31/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,222090,Building Improvement 24 | 31/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,63971,Building Improvement 25 | 24/04/2017,Scottish Historic Buildings Trust,Lawnmarket Work,50057,Building Improvement 26 | 30/04/2017,Morris & Spottiswood Ltd,George IV Bridge Work,63716,Building Improvement 27 | 15/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,245381,Building Improvement 28 | 12/09/2016,Flexiform,Kelvin Hall,42623,Building Improvement 29 | 12/09/2016,John Graham Construction Ltd,Causewayside Refurbishment,228689,Building Improvement 30 | 26/09/2016,Senator International,Kelvin Hall,35706,Building Improvement 31 | 26/09/2016,John Graham Construction Ltd,Causewayside Refurbishment,28378,Building Improvement 32 | 30/09/2016,A McGillivray,Causewayside Refurbishment,44392,Building Improvement 33 | 10/10/2016,John Graham Construction Ltd,Causewayside Refurbishment,303999,Building Improvement 34 | 31/10/2016,John Graham Construction Ltd,Causewayside Refurbishment,74245,Building Improvement 35 | 07/11/2016,CBRE,Kelvin Hall,83736,Building Improvement 36 | 14/11/2016,University Of Glasgow,Kelvin Hall,188682,Building Improvement 37 | 14/11/2016,John Graham Construction Ltd,Causewayside Refurbishment,362326,Building Improvement 38 | 12/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,385310,Building Improvement 39 | 30/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,253618,Building Improvement 40 | 30/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,45127,Building Improvement 41 | 21/04/2016,M & J Ballantyne Ltd,George IV Bridge Work,35098,Building Improvement 42 | 09/05/2016,John Graham Construction Ltd,Causewayside Refurbishment,64361,Building Improvement 43 | 09/05/2016,A McGillivray,Causewayside Refurbishment,53690,Building Improvement 44 | 16/05/2016,John Graham Construction Ltd,Causewayside Refurbishment,365344,Building Improvement 45 | 10/06/2016,Wavetek Ltd,Kelvin Hall,87589,Building Improvement 46 | 10/06/2016,John Graham Construction Ltd,Causewayside Refurbishment,381803,Building Improvement 47 | 30/06/2016,Glasgow City Council,Kelvin Hall,1700000,Building Improvement 48 | 11/07/2016,Wavetek Ltd,Kelvin Hall,65692,Building Improvement 49 | 11/07/2016,John Graham Construction Ltd,Causewayside Refurbishment,139845,Building Improvement 50 | 25/07/2016,A McGillivray,Causewayside Refurbishment,30113,Building Improvement 51 | 15/08/2016,John Graham Construction Ltd,Causewayside Refurbishment,196807,Building Improvement 52 | 06/11/2017,John Graham Construction Ltd,Causewayside Refurbishment,134208,Building Improvement 53 | 31/03/2017,NLS Foundation,Grant 
Payment,177500,Other 54 | 09/10/2017,Frost And Sullivan Ltd,Literary & Archival Items,28125,Literature & Archive 55 | 09/10/2017,JISC Services Ltd ,Literary & Archival Items,43481,Literature & Archive 56 | 27/02/2017,Cengage Learning (Emea )Ltd,Literary & Archival Items,43302,Literature & Archive 57 | 06/03/2017,Private Sale,Literary & Archival Items,72500,Literature & Archive 58 | 31/03/2017,Private Sale,Literary & Archival Items,3422500,Literature & Archive 59 | 24/04/2017,Cengage Learning (Emea )Ltd,Literary & Archival Items,43302,Literature & Archive 60 | 22/05/2017,ALDL,Legal Deposit Services,27067,Literature & Archive 61 | 19/09/2016,Jisc Services Ltd Subscription Account,Literary & Archival Items,42629,Literature & Archive 62 | 10/10/2016,Cengage Learning (Emea )Ltd,Literary & Archival Items,86604,Literature & Archive 63 | 24/10/2016,ALDL,ALDL Charges,32317,Literature & Archive 64 | 26/04/2016,Private Sale,Literary & Archival Items,30000,Literature & Archive 65 | 30/05/2016,ALDL,ALDL Charges,32317,Literature & Archive 66 | 15/07/2016,Sotheby'S,Literary & Archival Items,28500,Literature & Archive 67 | 18/07/2016,Christies,Literary & Archival Items,33800,Literature & Archive 68 | 31/07/2016,ALDL,ALDL Charges,32317,Literature & Archive 69 | 08/12/2016,Sothebys,Literary & Archival Items,166000,Literature & Archive 70 | 08/12/2016,Private Sale,Literary & Archival Items,87500,Literature & Archive 71 | 26/06/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills 72 | 26/06/2017,British Library,Legal Deposit Services,50056,Other 73 | 24/07/2017,ALDL,Legal Deposit Services,27067,Other 74 | 16/08/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills 75 | 23/08/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills 76 | 07/02/2017,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills 77 | 27/02/2017,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills 78 | 27/03/2017,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills 79 | 22/05/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills 80 | 26/09/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills 81 | 24/10/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills 82 | 08/12/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills 83 | 30/12/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills 84 | 23/05/2016,ECG Facilities Service,Facilities Management Charge,32777,Utility Bills 85 | 23/05/2016,ECG Facilities Service,Facilities Management Charge,32777,Utility Bills 86 | 28/06/2016,ECG Facilities Service,Facilities Management Charge,32832,Utility Bills 87 | 08/08/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills 88 | 24/08/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills 89 | 30/10/2017,ECG Facilities Service,Facilities Management Charge,35758,Utility Bills 90 | 16/08/2017,Ex Libris,IT equipment,76610,Software/IT 91 | 31/03/2017,XMA Scotland Ltd,IT equipment,33450,Software/IT 92 | 31/03/2017,XMA Scotland Ltd,IT equipment,84524,Software/IT 93 | 24/04/2017,Insight Direct (UK) Ltd,IT equipment,56768,Software/IT 94 | 09/05/2016,Computacenter Uk,Kelvin Hall,72835,Software/IT 95 | 23/05/2016,Computacenter Uk,Kelvin Hall,26506,Software/IT 96 | 15/09/2017,City Of Edinburgh Council,Non Domestic Rates ,57662,Utility Bills 97 | 15/09/2017,City Of Edinburgh Council,Non 
Domestic Rates ,142680,Utility Bills 98 | 08/05/2017,Anglian Water Business,Water,26832,Utility Bills 99 | 30/04/2016,City Of Edinburgh Council,Non Domestic Rates ,40800,Utility Bills 100 | 12/09/2016,City Of Edinburgh Council,Non Domestic Rates ,144330,Utility Bills 101 | 12/09/2016,City Of Edinburgh Council,Non Domestic Rates ,49827,Utility Bills 102 | 24/07/2017,AM Phillip,Vehicle Purchase,26604,Other -------------------------------------------------------------------------------- /examples/data/recommendations_embeddings_cache.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/examples/data/recommendations_embeddings_cache.pkl -------------------------------------------------------------------------------- /examples/fine-tuned_qa/answers_with_ft.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: To answer questions based on text documents, we recommend the procedure in 3 | [Question Answering using Embeddings](https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb). 4 | Some of the code below may rely on [deprecated API endpoints](https://github.com/openai/openai-cookbook/tree/main/transition_guides_for_deprecated_API_endpoints). 5 | """ 6 | 7 | import argparse 8 | 9 | import openai 10 | 11 | 12 | def create_context( 13 | question, search_file_id, max_len=1800, search_model="ada", max_rerank=10 14 | ): 15 | """ 16 | Create a context for a question by finding the most similar context from the search file. 17 | :param question: The question 18 | :param search_file_id: The file id of the search file 19 | :param max_len: The maximum length of the returned context (in tokens) 20 | :param search_model: The search model to use 21 | :param max_rerank: The maximum number of reranking 22 | :return: The context 23 | """ 24 | results = openai.Engine(search_model).search( 25 | search_model=search_model, 26 | query=question, 27 | max_rerank=max_rerank, 28 | file=search_file_id, 29 | return_metadata=True, 30 | ) 31 | returns = [] 32 | cur_len = 0 33 | for result in results["data"]: 34 | cur_len += int(result["metadata"]) + 4 35 | if cur_len > max_len: 36 | break 37 | returns.append(result["text"]) 38 | return "\n\n###\n\n".join(returns) 39 | 40 | 41 | def answer_question( 42 | search_file_id="", 43 | fine_tuned_qa_model="", 44 | question="Which country won the European Football championship in 2021?", 45 | max_len=1800, 46 | search_model="ada", 47 | max_rerank=10, 48 | debug=False, 49 | stop_sequence=["\n", "."], 50 | max_tokens=100, 51 | ): 52 | """ 53 | Answer a question based on the most similar context from the search file, using your fine-tuned model. 
54 | :param question: The question 55 | :param fine_tuned_qa_model: The fine-tuned QA model 56 | :param search_file_id: The file id of the search file 57 | :param max_len: The maximum length of the returned context (in tokens) 58 | :param search_model: The search model to use 59 | :param max_rerank: The maximum number of reranking 60 | :param debug: Whether to output debug information 61 | :param stop_sequence: The stop sequence for the Q&A model 62 | :param max_tokens: The maximum number of tokens to return 63 | :return: The answer 64 | """ 65 | context = create_context( 66 | question, 67 | search_file_id, 68 | max_len=max_len, 69 | search_model=search_model, 70 | max_rerank=max_rerank, 71 | ) 72 | if debug: 73 | print("Context:\n" + context) 74 | print("\n\n") 75 | try: 76 | # fine-tuned models require the model parameter, whereas other models require the engine parameter 77 | model_param = ( 78 | {"model": fine_tuned_qa_model} 79 | if ":" in fine_tuned_qa_model 80 | and fine_tuned_qa_model.split(":")[1].startswith("ft") 81 | else {"engine": fine_tuned_qa_model} 82 | ) 83 | response = openai.Completion.create( 84 | prompt=f"Answer the question based on the context below\n\nText: {context}\n\n---\n\nQuestion: {question}\nAnswer:", 85 | temperature=0, 86 | max_tokens=max_tokens, 87 | top_p=1, 88 | frequency_penalty=0, 89 | presence_penalty=0, 90 | stop=stop_sequence, 91 | **model_param, 92 | ) 93 | return response["choices"][0]["text"] 94 | except Exception as e: 95 | print(e) 96 | return "" 97 | 98 | 99 | if __name__ == "__main__": 100 | parser = argparse.ArgumentParser( 101 | description="Rudimentary functionality of the answers endpoint with a fine-tuned Q&A model.", 102 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 103 | ) 104 | parser.add_argument( 105 | "--search_file_id", help="Search file id", required=True, type=str 106 | ) 107 | parser.add_argument( 108 | "--fine_tuned_qa_model", help="Fine-tuned QA model id", required=True, type=str 109 | ) 110 | parser.add_argument( 111 | "--question", help="Question to answer", required=True, type=str 112 | ) 113 | parser.add_argument( 114 | "--max_len", 115 | help="Maximum length of the returned context (in tokens)", 116 | default=1800, 117 | type=int, 118 | ) 119 | parser.add_argument( 120 | "--search_model", help="Search model to use", default="ada", type=str 121 | ) 122 | parser.add_argument( 123 | "--max_rerank", 124 | help="Maximum number of reranking for the search", 125 | default=10, 126 | type=int, 127 | ) 128 | parser.add_argument( 129 | "--debug", help="Print debug information (context used)", action="store_true" 130 | ) 131 | parser.add_argument( 132 | "--stop_sequence", 133 | help="Stop sequences for the Q&A model", 134 | default=["\n", "."], 135 | nargs="+", 136 | type=str, 137 | ) 138 | parser.add_argument( 139 | "--max_tokens", 140 | help="Maximum number of tokens to return", 141 | default=100, 142 | type=int, 143 | ) 144 | args = parser.parse_args() 145 | response = answer_question( 146 | search_file_id=args.search_file_id, 147 | fine_tuned_qa_model=args.fine_tuned_qa_model, 148 | question=args.question, 149 | max_len=args.max_len, 150 | search_model=args.search_model, 151 | max_rerank=args.max_rerank, 152 | debug=args.debug, 153 | stop_sequence=args.stop_sequence, 154 | max_tokens=args.max_tokens, 155 | ) 156 | print(f"Answer: {response}") 157 | -------------------------------------------------------------------------------- /how_to_work_with_large_language_models.md:
-------------------------------------------------------------------------------- 1 | # How to work with large language models 2 | 3 | ## How large language models work 4 | 5 | [Large language models][Large language models Blog Post] are functions that map text to text. Given an input string of text, a large language model predicts the text that should come next. 6 | 7 | The magic of large language models is that by being trained to minimize this prediction error over vast quantities of text, the models end up learning concepts useful for these predictions. For example, they learn: 8 | 9 | * how to spell 10 | * how grammar works 11 | * how to paraphrase 12 | * how to answer questions 13 | * how to hold a conversation 14 | * how to write in many languages 15 | * how to code 16 | * etc. 17 | 18 | None of these capabilities are explicitly programmed in—they all emerge as a result of training. 19 | 20 | GPT-3 powers [hundreds of software products][GPT3 Apps Blog Post], including productivity apps, education apps, games, and more. 21 | 22 | ## How to control a large language model 23 | 24 | Of all the inputs to a large language model, by far the most influential is the text prompt. 25 | 26 | Large language models can be prompted to produce output in a few ways: 27 | 28 | * **Instruction**: Tell the model what you want 29 | * **Completion**: Induce the model to complete the beginning of what you want 30 | * **Demonstration**: Show the model what you want, with either: 31 | * A few examples in the prompt 32 | * Many hundreds or thousands of examples in a fine-tuning training dataset 33 | 34 | An example of each is shown below. 35 | 36 | ### Instruction prompts 37 | 38 | Instruction-following models (e.g., `text-davinci-003` or any model beginning with `text-`) are specially designed to follow instructions. Write your instruction at the top of the prompt (or at the bottom, or both), and the model will do its best to follow the instruction and then stop. Instructions can be detailed, so don't be afraid to write a paragraph explicitly detailing the output you want. 39 | 40 | Example instruction prompt: 41 | 42 | ```text 43 | Extract the name of the author from the quotation below. 44 | 45 | “Some humans theorize that intelligent species go extinct before they can expand into outer space. If they're correct, then the hush of the night sky is the silence of the graveyard.” 46 | ― Ted Chiang, Exhalation 47 | ``` 48 | 49 | Output: 50 | 51 | ```text 52 | Ted Chiang 53 | ``` 54 | 55 | ### Completion prompt example 56 | 57 | Completion-style prompts take advantage of how large language models try to write text they think is most likely to come next. To steer the model, try beginning a pattern or sentence that will be completed by the output you want to see. Relative to direct instructions, this mode of steering large language models can take more care and experimentation. In addition, the models won't necessarily know where to stop, so you will often need stop sequences or post-processing to cut off text generated beyond the desired output. 58 | 59 | Example completion prompt: 60 | 61 | ```text 62 | “Some humans theorize that intelligent species go extinct before they can expand into outer space.
If they're correct, then the hush of the night sky is the silence of the graveyard.” 63 | ― Ted Chiang, Exhalation 64 | 65 | The author of this quote is 66 | ``` 67 | 68 | Output: 69 | 70 | ```text 71 | Ted Chiang 72 | ``` 73 | 74 | ### Demonstration prompt example (few-shot learning) 75 | 76 | Similar to completion-style prompts, demonstrations can show the model what you want it to do. This approach is sometimes called few-shot learning, as the model learns from a few examples provided in the prompt. 77 | 78 | Example demonstration prompt: 79 | 80 | ```text 81 | Quote: 82 | “When the reasoning mind is forced to confront the impossible again and again, it has no choice but to adapt.” 83 | ― N.K. Jemisin, The Fifth Season 84 | Author: N.K. Jemisin 85 | 86 | Quote: 87 | “Some humans theorize that intelligent species go extinct before they can expand into outer space. If they're correct, then the hush of the night sky is the silence of the graveyard.” 88 | ― Ted Chiang, Exhalation 89 | Author: 90 | ``` 91 | 92 | Output: 93 | 94 | ```text 95 | Ted Chiang 96 | ``` 97 | 98 | ### Fine-tuned prompt example 99 | 100 | With enough training examples, you can [fine-tune][Fine Tuning Docs] a custom model. In this case, instructions become unnecessary, as the model can learn the task from the training data provided. However, it can be helpful to include separator sequences (e.g., `->` or `###` or any string that doesn't commonly appear in your inputs) to tell the model when the prompt has ended and the output should begin. Without separator sequences, there is a risk that the model continues elaborating on the input text rather than starting on the answer you want to see. 101 | 102 | Example fine-tuned prompt (for a model that has been custom trained on similar prompt-completion pairs): 103 | 104 | ```text 105 | “Some humans theorize that intelligent species go extinct before they can expand into outer space. If they're correct, then the hush of the night sky is the silence of the graveyard.” 106 | ― Ted Chiang, Exhalation 107 | 108 | ### 109 | 110 | 111 | ``` 112 | 113 | Output: 114 | 115 | ```text 116 | Ted Chiang 117 | ``` 118 | 119 | ## Code Capabilities 120 | 121 | Large language models aren't only great at text - they can be great at code too. OpenAI's specialized code model is called [Codex]. 122 | 123 | Codex powers [more than 70 products][Codex Apps Blog Post], including: 124 | 125 | * [GitHub Copilot] (autocompletes code in VS Code and other IDEs) 126 | * [Pygma](https://pygma.app/) (turns Figma designs into code) 127 | * [Replit](https://replit.com/) (has an 'Explain code' button and other features) 128 | * [Warp](https://www.warp.dev/) (a smart terminal with AI command search) 129 | * [Machinet](https://machinet.net/) (writes Java unit test templates) 130 | 131 | Note that unlike instruction-following text models (e.g., `text-davinci-002`), Codex is *not* trained to follow instructions. As a result, designing good prompts can take more care. 132 | 133 | ### More prompt advice 134 | 135 | For more prompt examples, visit [OpenAI Examples][OpenAI Examples]. 136 | 137 | In general, the input prompt is the best lever for improving model outputs. You can try tricks like: 138 | 139 | * **Give more explicit instructions.** E.g., if you want the output to be a comma separated list, ask it to return a comma separated list. If you want it to say "I don't know" when it doesn't know the answer, tell it 'Say "I don't know" if you do not know the answer.'
140 | * **Supply better examples.** If you're demonstrating examples in your prompt, make sure that your examples are diverse and high quality. 141 | * **Ask the model to answer as if it was an expert.** Explicitly asking the model to produce high quality output or output as if it was written by an expert can induce the model to give higher quality answers that it thinks an expert would write. E.g., "The following answer is correct, high-quality, and written by an expert." 142 | * **Prompt the model to write down the series of steps explaining its reasoning.** E.g., prepend your answer with something like "[Let's think step by step](https://arxiv.org/pdf/2205.11916v1.pdf)." Prompting the model to give an explanation of its reasoning before its final answer can increase the likelihood that its final answer is consistent and correct. 143 | 144 | 145 | 146 | [Fine Tuning Docs]: https://beta.openai.com/docs/guides/fine-tuning 147 | [Codex Apps Blog Post]: https://openai.com/blog/codex-apps/ 148 | [Large language models Blog Post]: https://openai.com/blog/better-language-models/ 149 | [GitHub Copilot]: https://copilot.github.com/ 150 | [Codex]: https://openai.com/blog/openai-codex/ 151 | [GPT3 Apps Blog Post]: https://openai.com/blog/gpt-3-apps/ 152 | [OpenAI Examples]: https://beta.openai.com/examples 153 | -------------------------------------------------------------------------------- /images/OpenAI_Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/OpenAI_Logo.png -------------------------------------------------------------------------------- /images/chain_of_thought_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/chain_of_thought_fig1.png -------------------------------------------------------------------------------- /images/chain_of_thought_fig11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/chain_of_thought_fig11.png -------------------------------------------------------------------------------- /images/chain_of_thought_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/chain_of_thought_fig3.png -------------------------------------------------------------------------------- /images/chain_of_thought_fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/chain_of_thought_fig5.png -------------------------------------------------------------------------------- /images/faithful-reasoning_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig1.png -------------------------------------------------------------------------------- /images/faithful-reasoning_fig2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig2.png -------------------------------------------------------------------------------- /images/faithful-reasoning_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig3.png -------------------------------------------------------------------------------- /images/faithful-reasoning_fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig4.png -------------------------------------------------------------------------------- /images/faithful-reasoning_fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig5.png -------------------------------------------------------------------------------- /images/faithful-reasoning_fig7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig7.png -------------------------------------------------------------------------------- /images/faithful-reasoning_tab2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_tab2.png -------------------------------------------------------------------------------- /images/faithful-reasoning_tab5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_tab5.png -------------------------------------------------------------------------------- /images/least-to-most_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/least-to-most_fig1.png -------------------------------------------------------------------------------- /images/least-to-most_tab11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/least-to-most_tab11.png -------------------------------------------------------------------------------- /images/least-to-most_tab4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/least-to-most_tab4.png -------------------------------------------------------------------------------- /images/least-to-most_tab9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/least-to-most_tab9.png -------------------------------------------------------------------------------- 
/images/lm_cascades_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig1.png -------------------------------------------------------------------------------- /images/lm_cascades_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig3.png -------------------------------------------------------------------------------- /images/lm_cascades_fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig4.png -------------------------------------------------------------------------------- /images/lm_cascades_fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig5.png -------------------------------------------------------------------------------- /images/lm_cascades_fig6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig6.png -------------------------------------------------------------------------------- /images/maieutic_fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/maieutic_fig2.png -------------------------------------------------------------------------------- /images/maieutic_fig6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/maieutic_fig6.png -------------------------------------------------------------------------------- /images/maieutic_tab1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/maieutic_tab1.png -------------------------------------------------------------------------------- /images/selection-inference_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/selection-inference_fig1.png -------------------------------------------------------------------------------- /images/selection-inference_fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/selection-inference_fig4.png -------------------------------------------------------------------------------- /images/self-consistency_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/self-consistency_fig1.png 
-------------------------------------------------------------------------------- /images/self-consistency_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/self-consistency_fig3.png -------------------------------------------------------------------------------- /images/star_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/star_fig1.png -------------------------------------------------------------------------------- /images/star_tab1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/star_tab1.png -------------------------------------------------------------------------------- /images/verifiers_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/verifiers_fig3.png -------------------------------------------------------------------------------- /images/verifiers_fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/verifiers_fig5.png -------------------------------------------------------------------------------- /images/zero-shot_reasoners_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/zero-shot_reasoners_fig1.png -------------------------------------------------------------------------------- /images/zero-shot_reasoners_fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/zero-shot_reasoners_fig2.png -------------------------------------------------------------------------------- /images/zero-shot_reasoners_tab1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/zero-shot_reasoners_tab1.png -------------------------------------------------------------------------------- /images/zero-shot_reasoners_tab5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/zero-shot_reasoners_tab5.png -------------------------------------------------------------------------------- /solutions/web_crawl_Q&A/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.3 2 | aiosignal==1.3.1 3 | appnope==0.1.3 4 | asttokens==2.2.1 5 | async-timeout==4.0.2 6 | attrs==22.2.0 7 | backcall==0.2.0 8 | beautifulsoup4==4.11.1 9 | blobfile==2.0.1 10 | bs4==0.0.1 11 | certifi==2022.12.7 12 | charset-normalizer==2.1.1 13 | comm==0.1.2 14 | contourpy==1.0.7 15 | cycler==0.11.0 16 | debugpy==1.6.5 17 | decorator==5.1.1 18 | docopt==0.6.2 19 | entrypoints==0.4 
20 | executing==1.2.0 21 | filelock==3.9.0 22 | fonttools==4.38.0 23 | frozenlist==1.3.3 24 | html==1.13 25 | huggingface-hub==0.11.1 26 | idna==3.4 27 | ipykernel==6.20.1 28 | ipython==8.8.0 29 | jedi==0.18.2 30 | joblib==1.2.0 31 | jupyter_client==7.4.8 32 | jupyter_core==5.1.3 33 | kiwisolver==1.4.4 34 | lxml==4.9.2 35 | matplotlib==3.6.3 36 | matplotlib-inline==0.1.6 37 | multidict==6.0.4 38 | nest-asyncio==1.5.6 39 | numpy==1.24.1 40 | openai==0.26.1 41 | packaging==23.0 42 | pandas==1.5.2 43 | parso==0.8.3 44 | pexpect==4.8.0 45 | pickleshare==0.7.5 46 | Pillow==9.4.0 47 | pipreqs==0.4.11 48 | platformdirs==2.6.2 49 | plotly==5.12.0 50 | prompt-toolkit==3.0.36 51 | psutil==5.9.4 52 | ptyprocess==0.7.0 53 | pure-eval==0.2.2 54 | pycryptodomex==3.17 55 | Pygments==2.14.0 56 | pyparsing==3.0.9 57 | python-dateutil==2.8.2 58 | pytz==2022.7.1 59 | PyYAML==6.0 60 | pyzmq==24.0.1 61 | regex==2022.10.31 62 | requests==2.28.1 63 | scikit-learn==1.2.0 64 | scipy==1.10.0 65 | six==1.16.0 66 | soupsieve==2.3.2.post1 67 | stack-data==0.6.2 68 | tenacity==8.1.0 69 | threadpoolctl==3.1.0 70 | tiktoken==0.1.2 71 | tokenizers==0.13.2 72 | tornado==6.2 73 | tqdm==4.64.1 74 | traitlets==5.8.1 75 | transformers==4.25.1 76 | typing_extensions==4.4.0 77 | urllib3==1.26.13 78 | wcwidth==0.2.5 79 | yarg==0.1.9 80 | yarl==1.8.2 -------------------------------------------------------------------------------- /solutions/web_crawl_Q&A/web-qa.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### Step 1 3 | ################################################################################ 4 | 5 | import requests 6 | import re 7 | import urllib.request 8 | from bs4 import BeautifulSoup 9 | from collections import deque 10 | from html.parser import HTMLParser 11 | from urllib.parse import urlparse 12 | import os 13 | import pandas as pd 14 | import tiktoken 15 | import openai 16 | from openai.embeddings_utils import distances_from_embeddings 17 | import pandas as pd 18 | import numpy as np 19 | from openai.embeddings_utils import distances_from_embeddings, cosine_similarity 20 | 21 | # Regex pattern to match a URL 22 | HTTP_URL_PATTERN = r'^http[s]*://.+' 23 | 24 | # Define root domain to crawl 25 | domain = "openai.com" 26 | full_url = "https://openai.com/" 27 | 28 | # Create a class to parse the HTML and get the hyperlinks 29 | class HyperlinkParser(HTMLParser): 30 | def __init__(self): 31 | super().__init__() 32 | # Create a list to store the hyperlinks 33 | self.hyperlinks = [] 34 | 35 | # Override the HTMLParser's handle_starttag method to get the hyperlinks 36 | def handle_starttag(self, tag, attrs): 37 | attrs = dict(attrs) 38 | 39 | # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks 40 | if tag == "a" and "href" in attrs: 41 | self.hyperlinks.append(attrs["href"]) 42 | 43 | ################################################################################ 44 | ### Step 2 45 | ################################################################################ 46 | 47 | # Function to get the hyperlinks from a URL 48 | def get_hyperlinks(url): 49 | 50 | # Try to open the URL and read the HTML 51 | try: 52 | # Open the URL and read the HTML 53 | with urllib.request.urlopen(url) as response: 54 | 55 | # If the response is not HTML, return an empty list 56 | if not response.info().get('Content-Type').startswith("text/html"): 57 | return [] 
58 | 59 | # Decode the HTML 60 | html = response.read().decode('utf-8') 61 | except Exception as e: 62 | print(e) 63 | return [] 64 | 65 | # Create the HTML Parser and then Parse the HTML to get hyperlinks 66 | parser = HyperlinkParser() 67 | parser.feed(html) 68 | 69 | return parser.hyperlinks 70 | 71 | ################################################################################ 72 | ### Step 3 73 | ################################################################################ 74 | 75 | # Function to get the hyperlinks from a URL that are within the same domain 76 | def get_domain_hyperlinks(local_domain, url): 77 | clean_links = [] 78 | for link in set(get_hyperlinks(url)): 79 | clean_link = None 80 | 81 | # If the link is a URL, check if it is within the same domain 82 | if re.search(HTTP_URL_PATTERN, link): 83 | # Parse the URL and check if the domain is the same 84 | url_obj = urlparse(link) 85 | if url_obj.netloc == local_domain: 86 | clean_link = link 87 | 88 | # If the link is not a URL, check if it is a relative link 89 | else: 90 | if link.startswith("/"): 91 | link = link[1:] 92 | elif link.startswith("#") or link.startswith("mailto:"): 93 | continue 94 | clean_link = "https://" + local_domain + "/" + link 95 | 96 | if clean_link is not None: 97 | if clean_link.endswith("/"): 98 | clean_link = clean_link[:-1] 99 | clean_links.append(clean_link) 100 | 101 | # Return the list of hyperlinks that are within the same domain 102 | return list(set(clean_links)) 103 | 104 | 105 | ################################################################################ 106 | ### Step 4 107 | ################################################################################ 108 | 109 | def crawl(url): 110 | # Parse the URL and get the domain 111 | local_domain = urlparse(url).netloc 112 | 113 | # Create a queue to store the URLs to crawl 114 | queue = deque([url]) 115 | 116 | # Create a set to store the URLs that have already been seen (no duplicates) 117 | seen = set([url]) 118 | 119 | # Create a directory to store the text files 120 | if not os.path.exists("text/"): 121 | os.mkdir("text/") 122 | 123 | if not os.path.exists("text/"+local_domain+"/"): 124 | os.mkdir("text/" + local_domain + "/") 125 | 126 | # Create a directory to store the csv files 127 | if not os.path.exists("processed"): 128 | os.mkdir("processed") 129 | 130 | # While the queue is not empty, continue crawling 131 | while queue: 132 | 133 | # Get the next URL from the queue 134 | url = queue.pop() 135 | print(url) # for debugging and to see the progress 136 | 137 | # Save text from the url to a .txt file 138 | with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w") as f: 139 | 140 | # Get the text from the URL using BeautifulSoup 141 | soup = BeautifulSoup(requests.get(url).text, "html.parser") 142 | 143 | # Get the text but remove the tags 144 | text = soup.get_text() 145 | 146 | # If the crawler gets to a page that requires JavaScript, it will stop the crawl 147 | if ("You need to enable JavaScript to run this app." 
in text): 148 | print("Unable to parse page " + url + " due to JavaScript being required") 149 | 150 | # Write the text to the file in the text directory (note: this happens even for JavaScript-required pages; the check above only prints a warning) 151 | f.write(text) 152 | 153 | # Get the hyperlinks from the URL and add them to the queue 154 | for link in get_domain_hyperlinks(local_domain, url): 155 | if link not in seen: 156 | queue.append(link) 157 | seen.add(link) 158 | 159 | crawl(full_url) 160 | 161 | ################################################################################ 162 | ### Step 5 163 | ################################################################################ 164 | 165 | def remove_newlines(serie): 166 | serie = serie.str.replace('\n', ' ') 167 | serie = serie.str.replace('\\n', ' ') 168 | serie = serie.str.replace('  ', ' ') 169 | serie = serie.str.replace('  ', ' ') 170 | return serie 171 | 172 | 173 | ################################################################################ 174 | ### Step 6 175 | ################################################################################ 176 | 177 | # Create a list to store the text files 178 | texts=[] 179 | 180 | # Get all the text files in the text directory 181 | for file in os.listdir("text/" + domain + "/"): 182 | 183 | # Open the file and read the text 184 | with open("text/" + domain + "/" + file, "r") as f: 185 | text = f.read() 186 | 187 | # Omit the first 11 characters of the filename (the 'openai.com_' prefix) and the last 4 ('.txt'), then replace '-' and '_' with spaces and strip '#update' 188 | texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text)) 189 | 190 | # Create a dataframe from the list of texts 191 | df = pd.DataFrame(texts, columns = ['fname', 'text']) 192 | 193 | # Set the text column to be the raw text with the newlines removed 194 | df['text'] = df.fname + ". " + remove_newlines(df.text) 195 | df.to_csv('processed/scraped.csv') 196 | df.head() 197 | 198 | ################################################################################ 199 | ### Step 7 200 | ################################################################################ 201 | 202 | # Load the cl100k_base tokenizer which is designed to work with the ada-002 model 203 | tokenizer = tiktoken.get_encoding("cl100k_base") 204 | 205 | df = pd.read_csv('processed/scraped.csv', index_col=0) 206 | df.columns = ['title', 'text'] 207 | 208 | # Tokenize the text and save the number of tokens to a new column 209 | df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x))) 210 | 211 | # Visualize the distribution of the number of tokens per row using a histogram 212 | df.n_tokens.hist() 213 | 214 | ################################################################################ 215 | ### Step 8 216 | ################################################################################ 217 | 218 | max_tokens = 500 219 | 220 | # Function to split the text into chunks of a maximum number of tokens 221 | def split_into_many(text, max_tokens = max_tokens): 222 | 223 | # Split the text into sentences 224 | sentences = text.split('. 
') 225 | 226 | # Get the number of tokens for each sentence 227 | n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences] 228 | 229 | chunks = [] 230 | tokens_so_far = 0 231 | chunk = [] 232 | 233 | # Loop through the sentences and tokens joined together in a tuple 234 | for sentence, token in zip(sentences, n_tokens): 235 | 236 | # If the number of tokens so far plus the number of tokens in the current sentence is greater 237 | # than the max number of tokens, then add the chunk to the list of chunks and reset 238 | # the chunk and tokens so far 239 | if tokens_so_far + token > max_tokens: 240 | chunks.append(". ".join(chunk) + ".") 241 | chunk = [] 242 | tokens_so_far = 0 243 | 244 | # If the number of tokens in the current sentence is greater than the max number of 245 | # tokens, go to the next sentence 246 | if token > max_tokens: 247 | continue 248 | 249 | # Otherwise, add the sentence to the chunk and add the number of tokens to the total 250 | chunk.append(sentence) 251 | tokens_so_far += token + 1 252 | 253 | return chunks 254 | 255 | 256 | shortened = [] 257 | 258 | # Loop through the dataframe 259 | for row in df.iterrows(): 260 | 261 | # If the text is None, go to the next row 262 | if row[1]['text'] is None: 263 | continue 264 | 265 | # If the number of tokens is greater than the max number of tokens, split the text into chunks 266 | if row[1]['n_tokens'] > max_tokens: 267 | shortened += split_into_many(row[1]['text']) 268 | 269 | # Otherwise, add the text to the list of shortened texts 270 | else: 271 | shortened.append( row[1]['text'] ) 272 | 273 | ################################################################################ 274 | ### Step 9 275 | ################################################################################ 276 | 277 | df = pd.DataFrame(shortened, columns = ['text']) 278 | df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x))) 279 | df.n_tokens.hist() 280 | 281 | ################################################################################ 282 | ### Step 10 283 | ################################################################################ 284 | 285 | df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding']) 286 | df.to_csv('processed/embeddings.csv') 287 | df.head() 288 | 289 | ################################################################################ 290 | ### Step 11 291 | ################################################################################ 292 | 293 | df=pd.read_csv('processed/embeddings.csv', index_col=0) 294 | df['embeddings'] = df['embeddings'].apply(eval).apply(np.array) 295 | 296 | df.head() 297 | 298 | ################################################################################ 299 | ### Step 12 300 | ################################################################################ 301 | 302 | def create_context( 303 | question, df, max_len=1800, size="ada" 304 | ): 305 | """ 306 | Create a context for a question by finding the most similar context from the dataframe 307 | """ 308 | 309 | # Get the embeddings for the question 310 | q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding'] 311 | 312 | # Get the distances from the embeddings 313 | df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine') 314 | 315 | 316 | returns = [] 317 | cur_len = 0 318 | 319 | # Sort by distance and add the text to the 
context until the context is too long 320 | for i, row in df.sort_values('distances', ascending=True).iterrows(): 321 | 322 | # Add the length of the text to the current length 323 | cur_len += row['n_tokens'] + 4 324 | 325 | # If the context is too long, break 326 | if cur_len > max_len: 327 | break 328 | 329 | # Else add it to the text that is being returned 330 | returns.append(row["text"]) 331 | 332 | # Return the context 333 | return "\n\n###\n\n".join(returns) 334 | 335 | def answer_question( 336 | df, 337 | model="text-davinci-003", 338 | question="Am I allowed to publish model outputs to Twitter, without a human review?", 339 | max_len=1800, 340 | size="ada", 341 | debug=False, 342 | max_tokens=150, 343 | stop_sequence=None 344 | ): 345 | """ 346 | Answer a question based on the most similar context from the dataframe texts 347 | """ 348 | context = create_context( 349 | question, 350 | df, 351 | max_len=max_len, 352 | size=size, 353 | ) 354 | # If debug, print the raw model response 355 | if debug: 356 | print("Context:\n" + context) 357 | print("\n\n") 358 | 359 | try: 360 | # Create a completion using the question and context 361 | response = openai.Completion.create( 362 | prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:", 363 | temperature=0, 364 | max_tokens=max_tokens, 365 | top_p=1, 366 | frequency_penalty=0, 367 | presence_penalty=0, 368 | stop=stop_sequence, 369 | model=model, 370 | ) 371 | return response["choices"][0]["text"].strip() 372 | except Exception as e: 373 | print(e) 374 | return "" 375 | 376 | ################################################################################ 377 | ### Step 13 378 | ################################################################################ 379 | 380 | print(answer_question(df, question="What day is it?", debug=False)) 381 | 382 | print(answer_question(df, question="What is our newest embeddings model?")) -------------------------------------------------------------------------------- /text_comparison_examples.md: -------------------------------------------------------------------------------- 1 | # Text comparison examples 2 | 3 | The [OpenAI API embeddings endpoint](https://beta.openai.com/docs/guides/embeddings) can be used to measure relatedness or similarity between pieces of text. 4 | 5 | By leveraging GPT-3's understanding of text, these embeddings [achieved state-of-the-art results](https://arxiv.org/abs/2201.10005) on benchmarks in unsupervised learning and transfer learning settings. 6 | 7 | Embeddings can be used for semantic search, recommendations, cluster analysis, near-duplicate detection, and more. 8 | 9 | For more information, read OpenAI's blog post announcements: 10 | 11 | * [Introducing Text and Code Embeddings (Jan 2022)](https://openai.com/blog/introducing-text-and-code-embeddings/) 12 | * [New and Improved Embedding Model (Dec 2022)](https://openai.com/blog/new-and-improved-embedding-model/) 13 | 14 | ## Semantic search 15 | 16 | Embeddings can be used for search either by themselves or as a feature in a larger system.
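As a rough sketch of the core idea in Python (embed the query and each document, then rank by cosine similarity), using the same `openai.Embedding.create` call as the other examples in this repo; the helper names and the in-memory corpus are illustrative, not a production design:

```python
import numpy as np
import openai

def get_embedding(text: str) -> np.ndarray:
    # Same embeddings call used elsewhere in this repo
    result = openai.Embedding.create(input=text, engine="text-embedding-ada-002")
    return np.array(result["data"][0]["embedding"])

def search(query: str, documents: list[str], top_n: int = 3) -> list[str]:
    # In a real system, precompute and store these instead of embedding per query
    doc_embeddings = [get_embedding(doc) for doc in documents]
    query_embedding = get_embedding(query)
    # Rank documents by cosine similarity to the query
    scores = [
        float(np.dot(query_embedding, d)
              / (np.linalg.norm(query_embedding) * np.linalg.norm(d)))
        for d in doc_embeddings
    ]
    ranked = sorted(zip(scores, documents), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked[:top_n]]
```

The `distances_from_embeddings` helper from `openai.embeddings_utils` (used in the web-crawl example above with `distance_metric='cosine'`) wraps the same computation.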
17 | 18 | The simplest way to use embeddings for search is as follows: 19 | 20 | * Before the search (precompute): 21 | * Split your text corpus into chunks smaller than the token limit (8,191 tokens for `text-embedding-ada-002`) 22 | * Embed each chunk of text 23 | * Store those embeddings in your own database or in a vector search provider like [Pinecone](https://www.pinecone.io) or [Weaviate](https://weaviate.io) 24 | * At the time of the search (live compute): 25 | * Embed the search query 26 | * Find the closest embeddings in your database 27 | * Return the top results 28 | 29 | An example of how to use embeddings for search is shown in [Semantic_text_search_using_embeddings.ipynb](examples/Semantic_text_search_using_embeddings.ipynb). 30 | 31 | In more advanced search systems, the cosine similarity of embeddings can be used as one feature among many in ranking search results. 32 | 33 | ## Question answering 34 | 35 | The best way to get reliably honest answers from GPT-3 is to give it source documents in which it can locate correct answers. Using the semantic search procedure above, you can cheaply search a corpus of documents for relevant information and then give that information to GPT-3, via the prompt, to answer a question. We demonstrate this in [Question_answering_using_embeddings.ipynb](examples/Question_answering_using_embeddings.ipynb). 36 | 37 | ## Recommendations 38 | 39 | Recommendations are quite similar to search, except that instead of a free-form text query, the inputs are items in a set. 40 | 41 | An example of how to use embeddings for recommendations is shown in [Recommendation_using_embeddings.ipynb](examples/Recommendation_using_embeddings.ipynb). 42 | 43 | Similar to search, these cosine similarity scores can either be used on their own to rank items or as features in larger ranking algorithms. 44 | 45 | ## Customizing Embeddings 46 | 47 | Although OpenAI's embedding model weights cannot be fine-tuned, you can nevertheless use training data to customize embeddings to your application. 48 | 49 | In [Customizing_embeddings.ipynb](examples/Customizing_embeddings.ipynb), we provide an example method for customizing your embeddings using training data. The idea of the method is to train a custom matrix that multiplies embedding vectors to produce new, customized embeddings. With good training data, this custom matrix will help emphasize the features relevant to your training labels. You can equivalently consider the matrix multiplication as (a) a modification of the embeddings or (b) a modification of the distance function used to measure the distances between embeddings. 50 | -------------------------------------------------------------------------------- /text_editing_examples.md: -------------------------------------------------------------------------------- 1 | # Text editing examples 2 | 3 | In addition to the [completions API endpoint][Completion API Docs], OpenAI offers an [edits API endpoint][Edit API Docs]. Read more at: 4 | 5 | * [Blog post announcement (Mar 2022)][GPT3 Edit Blog Post] 6 | * [Edit API documentation][Edit API Docs] 7 | 8 | In contrast to completions, which only take a single text input, edits take two text inputs: the instruction and the text to be modified.
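In code, a call to the edits endpoint looks roughly like this (a sketch; `text-davinci-edit-001` is assumed here as the edit-capable model, and the sample strings are illustrative — see the Edit API documentation linked above for the authoritative parameters):

```python
import openai

response = openai.Edit.create(
    model="text-davinci-edit-001",        # an edit-capable model
    instruction="Fix the grammar.",       # text input 1: the instruction
    input="This sentence have a error.",  # text input 2: the text to be modified
)
print(response["choices"][0]["text"])
```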
For example: 9 | 10 | Instruction input: 11 | 12 | ```text 13 | Fix the OCR errors 14 | ``` 15 | 16 | Text input: 17 | 18 | ```text 19 | Therewassomehostilityntheenergybehindthe researchreportedinPerceptrons....Part of ourdrivecame,aswequiteplainlyacknoweldgednourbook,fromhe facthatfundingndresearchnergywerebeingdissipatedon. . .misleadingttemptsouseconnectionistmethodsnpracticalappli-cations. 20 | ``` 21 | 22 | [Output](https://beta.openai.com/playground/p/5W5W6HHlHrGsLu1cpx0VF4qu): 23 | 24 | ```text 25 | There was some hostility in the energy behind the research reported in Perceptrons....Part of our drive came, as we quite plainly acknowledged in our book, from the fact that funding and research energy were being dissipated on...misleading attempts to use connectionist methods in practical applications. 26 | ``` 27 | 28 | In general, instructions can be imperative, present tense, or past tense. Experiment to see what works best for your use case. 29 | 30 | ## Translation 31 | 32 | One application of the edit API is translation. 33 | 34 | Large language models are excellent at translating across common languages. In 2021, [GPT-3 set](https://arxiv.org/abs/2110.05448) a new state-of-the-art record in unsupervised translation on the WMT14 English-French benchmark. 35 | 36 | Here's an example of how to translate text using the edits endpoint: 37 | 38 | Instruction input: 39 | 40 | ```text 41 | translation into French 42 | ``` 43 | 44 | Text input: 45 | 46 | ```text 47 | That's life. 48 | ``` 49 | 50 | [Output](https://beta.openai.com/playground/p/6JWAH8a4ZbEafSDyRsSVdgKr): 51 | 52 | ```text 53 | C'est la vie. 54 | ``` 55 | 56 | Of course, many tasks that can be accomplished with the edits endpoint can also be done with the completions endpoint. For example, you can request a translation by prepending an instruction as follows: 57 | 58 | ```text 59 | Translate the following text from English to French. 60 | 61 | English: That's life. 62 | French: 63 | ``` 64 | 65 | [Output](https://beta.openai.com/playground/p/UgaPfgjBNTRRPeNcMSNtGzcu): 66 | 67 | ```text 68 | C'est la vie. 69 | ``` 70 | 71 | Tips for translation: 72 | 73 | * Performance is best on the most common languages 74 | * We've seen better performance when the instruction is given in the final language (so if translating into French, give the instruction `Traduire le texte de l'anglais au français.` rather than `Translate the following text from English to French.`) 75 | * Backtranslation (as described [here](https://arxiv.org/abs/2110.05448)) can also increase performance 76 | * Text with colons and heavy punctuation can trip up the instruction-following models, especially if the instruction uses colons (e.g., `English: {english text} French:`) 77 | * The edits endpoint sometimes repeats the original text input alongside the translation, which can be monitored and filtered 78 | 79 | When it comes to translation, large language models particularly shine at combining other instructions alongside translation. For example, you can ask GPT-3 to translate Slovenian to English but keep all LaTeX typesetting commands unchanged.
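Here is a rough sketch of such a combined instruction using the completions endpoint (the model choice, the Slovenian sample, and the prompt wording are all illustrative):

```python
import openai

# Combined instruction: translate, but preserve LaTeX commands.
prompt = (
    "Translate the following text from Slovenian to English. "
    "Keep all LaTeX typesetting commands unchanged.\n\n"
    "Naj bo $f(x) = x^2$ \\emph{konveksna} funkcija.\n\n"
    "English translation:"
)
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    temperature=0,
    max_tokens=200,
)
print(response["choices"][0]["text"].strip())
```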
The following notebook details how we translated a Slovenian math book into English: 80 | 81 | [Translation of a Slovenian math book into English](examples/book_translation/translate_latex_book.ipynb) 82 | 83 | 84 | [Edit API Docs]: https://beta.openai.com/docs/api-reference/edits 85 | [Completion API Docs]: https://beta.openai.com/docs/api-reference/completions 86 | [GPT3 Edit Blog Post]: https://openai.com/blog/gpt-3-edit-insert/ -------------------------------------------------------------------------------- /text_explanation_examples.md: -------------------------------------------------------------------------------- 1 | # Text explanation examples 2 | 3 | Large language models are useful for distilling information from long texts. Applications include: 4 | 5 | * Answering questions about a piece of text, e.g.: 6 | * Querying a knowledge base to help people look up things they don't know 7 | * Querying an unfamiliar document to understand what it contains 8 | * Querying a document with structured questions in order to extract tags, classes, entities, etc. 9 | * Summarizing text, e.g.: 10 | * Summarizing long documents 11 | * Summarizing back-and-forth emails or message threads 12 | * Summarizing detailed meeting notes with key points and next steps 13 | * Classifying text, e.g.: 14 | * Classifying customer feedback messages by topic or type 15 | * Classifying documents by topic or type 16 | * Classifying the tone or sentiment of text 17 | * Extracting entities, e.g.: 18 | * Extracting contact information from a customer message 19 | * Extracting names of people or companies or products from a document 20 | * Extracting things mentioned in customer reviews or feedback 21 | 22 | Below are some simple examples of each. 23 | 24 | ## Answering questions about a piece of text 25 | 26 | Here's an example prompt for answering questions about a piece of text: 27 | 28 | ```text 29 | Using the following text, answer the following question. If the answer is not contained within the text, say "I don't know." 30 | 31 | Text: 32 | """ 33 | Oklo Mine (sometimes Oklo Reactor or Oklo Mines), located in Oklo, Gabon on the west coast of Central Africa, is believed to be the only natural nuclear fission reactor. Oklo consists of 16 sites at which self-sustaining nuclear fission reactions are thought to have taken place approximately 1.7 billion years ago, and ran for hundreds of thousands of years. It is estimated to have averaged under 100 kW of thermal power during that time. 34 | """ 35 | 36 | Question: How many natural fission reactors have ever been discovered? 37 | 38 | Answer: 39 | ``` 40 | 41 | [Output](https://beta.openai.com/playground/p/c8ZL7ioqKK7zxrMT2T9Md3gJ): 42 | 43 | ```text 44 | One. Oklo Mine is believed to be the only natural nuclear fission reactor. 45 | ``` 46 | 47 | If the text you wish to ask about is longer than the token limit (~4,000 tokens for `text-davinci-002`/`-003` and ~2,000 tokens for earlier models), you can split the text into smaller pieces, rank them by relevance, and then ask your question only using the most-relevant-looking pieces. This is demonstrated in [Question_answering_using_embeddings.ipynb](examples/Question_answering_using_embeddings.ipynb). 48 | 49 | In the same way that students do better on tests when allowed to access notes, GPT-3 does better at answering questions when it's given text containing the answer.
50 | Without notes, GPT-3 has to rely on its own long-term memory (i.e., internal weights), which is more likely to produce confabulated or hallucinated answers. 51 | 52 | ## Summarization 53 | 54 | Here's a simple example prompt to summarize a piece of text: 55 | 56 | ```text 57 | Summarize the following text. 58 | 59 | Text: 60 | """ 61 | Two independent experiments reported their results this morning at CERN, Europe's high-energy physics laboratory near Geneva in Switzerland. Both show convincing evidence of a new boson particle weighing around 125 gigaelectronvolts, which so far fits predictions of the Higgs previously made by theoretical physicists. 62 | 63 | "As a layman I would say: 'I think we have it'. Would you agree?" Rolf-Dieter Heuer, CERN's director-general, asked the packed auditorium. The physicists assembled there burst into applause. 64 | """ 65 | 66 | Summary: 67 | ``` 68 | 69 | [Output](https://beta.openai.com/playground/p/pew7DNB908TkUYiF0ZOdaIGc): 70 | 71 | ```text 72 | CERN's director-general asked a packed auditorium if they agreed that two independent experiments had found convincing evidence of a new boson particle that fits predictions of the Higgs, to which the physicists assembled there responded with applause. 73 | ``` 74 | 75 | The triple quotation marks `"""` used in these example prompts aren't special; GPT-3 can recognize most delimiters, including `<>`, `{}`, or `###`. For long pieces of text, we recommend using some kind of delimiter to help disambiguate where one section of text ends and the next begins. 76 | 77 | ## Classification 78 | 79 | If you want to classify text, the best approach depends on whether the classes are known in advance. 80 | 81 | If your classes _are_ known in advance, classification is often best done with a fine-tuned model, as demonstrated in [Fine-tuned_classification.ipynb](examples/Fine-tuned_classification.ipynb). 82 | 83 | If your classes are not known in advance (e.g., they are set by a user or generated on the fly), you can try zero-shot classification, either by giving an instruction containing the classes or by using embeddings to see which class label (or other previously classified texts) is most similar to the text (as demonstrated in [Zero-shot_classification_with_embeddings.ipynb](examples/Zero-shot_classification_with_embeddings.ipynb)). 84 | 85 | ## Entity extraction 86 | 87 | Here's an example prompt for entity extraction: 88 | 89 | ```text 90 | From the text below, extract the following entities in the following format: 91 | Companies: 92 | People & titles: 93 | 94 | Text: 95 | """ 96 | In March 1981, United States v. AT&T came to trial under Assistant Attorney General William Baxter. AT&T chairman Charles L. Brown thought the company would be gutted. He realized that AT&T would lose and, in December 1981, resumed negotiations with the Justice Department. Reaching an agreement less than a month later, Brown agreed to divestiture—the best and only realistic alternative. AT&T's decision allowed it to retain its research and manufacturing arms. The decree, titled the Modification of Final Judgment, was an adjustment of the Consent Decree of 14 January 1956. Judge Harold H. Greene was given the authority over the modified decree.... 97 | 98 | In 1982, the U.S. government announced that AT&T would cease to exist as a monopolistic entity.
On 1 January 1984, it was split into seven smaller regional companies, Bell South, Bell Atlantic, NYNEX, American Information Technologies, Southwestern Bell, US West, and Pacific Telesis, to handle regional phone services in the U.S. AT&T retains control of its long distance services, but was no longer protected from competition. 99 | """ 100 | ``` 101 | 102 | [Output](https://beta.openai.com/playground/p/of47T7N5CtHF4RlvwFkTu3pN): 103 | 104 | ```text 105 | 106 | Companies: AT&T, Bell South, Bell Atlantic, NYNEX, American Information Technologies, Southwestern Bell, US West, Pacific Telesis 107 | People & titles: William Baxter (Assistant Attorney General), Charles L. Brown (AT&T chairman), Harold H. Greene (Judge) 108 | ``` -------------------------------------------------------------------------------- /text_writing_examples.md: -------------------------------------------------------------------------------- 1 | # Text writing examples 2 | 3 | Large language models are excellent at writing. They can assist with all sorts of tasks: 4 | 5 | * Blog posts 6 | * Email copy 7 | * Ad copy 8 | * Website copy 9 | * Product descriptions 10 | * Memos 11 | * Storytelling 12 | * Brainstorming 13 | * Question generation 14 | * etc. 15 | 16 | The easiest way to prompt GPT-3 is to tell it what you'd like. For example, if you want it to write an email, you could use a prompt like this: 17 | 18 | ```text 19 | Write an email to a colleague named Jill congratulating her on her promotion. The tone should be warm yet professional. Mention how you admire the work she's been putting in. Include a joke about how her pet lizard Max enjoys eating grasshoppers. Mention how you're looking forward to the team off-site next week. 20 | ``` 21 | 22 | [Output](https://beta.openai.com/playground/p/fUeh5saVLI2dhU0WU3nZtIiU): 23 | 24 | ```text 25 | Subject: Congratulations on Your Promotion, Jill! 26 | 27 | Dear Jill, 28 | 29 | Congratulations on your promotion! I'm so proud of you and all the hard work you've been putting in. It's been inspiring to watch you grow in your career. 30 | 31 | I'm sure Max the lizard is also celebrating with you. I hear he loves eating grasshoppers, so you should treat him to a special snack! 32 | 33 | I'm looking forward to seeing you at the team off-site next week. I'm sure it will be a great opportunity to celebrate your success. 34 | 35 | All the best, 36 | 37 | [Your Name] 38 | ``` 39 | 40 | In general, writing can work with any style of prompt, whether it's an instruction or a series of examples to follow. Experiment to see what works best for your use case. 41 | 42 | Writing also works with any type of model, though they each have strengths and weaknesses. 43 | 44 | | | Advantages | Disadvantages | 45 | | ---------------------------------------------------------- | ----------------------------------------------------------------------------- | -------------------------------------------------------------------------------- | 46 | | Instruction-following models
(e.g., `text-davinci-003`) | Easiest to use | Less diverse; less creative; sometimes harder to steer tone, style, etc. | 47 | | Base models
(e.g., `davinci`) | Potentially more creative and diverse | Harder to prompt well, more expensive (as examples in the prompt cost extra tokens) | 48 | | Fine-tuned models | Can train off of many examples; cheaper than including examples in the prompt | Hard to gather training data; training makes iteration slower and more expensive | 49 | -------------------------------------------------------------------------------- /transition_guides_for_deprecated_API_endpoints/README.md: -------------------------------------------------------------------------------- 1 | # Deprecation of Answers, Classification, and Search 2 | 3 | In 2021, OpenAI released specialized endpoints in beta for Answers, Classification, and Search. 4 | 5 | While these specialized endpoints were convenient, they had two drawbacks: 6 | 7 | 1. These specialized endpoints were eclipsed by techniques that achieved better results. 8 | 2. These specialized endpoints were more difficult to customize and optimize for individual use cases. 9 | 10 | As a result, **the Answers, Classifications, and Search endpoints are being deprecated.** 11 | 12 | ## Timeline of deprecation 13 | 14 | For those who have not used these endpoints, nothing will change except that access will no longer be available. 15 | 16 | **For existing users of these endpoints, access will continue until December 3, 2022.** Before that date, we strongly encourage developers to switch over to newer techniques which produce better results. 17 | 18 | ## How to transition 19 | 20 | We've written guides and code examples for transitioning from the deprecated API endpoints to better methods. 21 | 22 | ### Answers 23 | 24 | [Guide: How to transition off the Answers endpoint](https://help.openai.com/en/articles/6233728-answers-transition-guide) 25 | 26 | * Option 1: transition to embeddings-based search **(recommended)** 27 | * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb) 28 | 29 | * Option 2: reimplement Answers endpoint functionality 30 | * Example code: [answers_functionality_example.py](answers_functionality_example.py) 31 | 32 | ### Classification 33 | 34 | [Guide: How to transition off the Classifications endpoint](https://help.openai.com/en/articles/6272941-classifications-transition-guide) 35 | 36 | * Option 1: transition to fine-tuning **(recommended)** 37 | * Example code: [Fine-tuned_classification.ipynb](../examples/Fine-tuned_classification.ipynb) 38 | * Option 2: transition to embeddings 39 | * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb) 40 | * Option 3: reimplement Classifications endpoint functionality 41 | * Example code: [classification_functionality_example.py](classification_functionality_example.py) 42 | 43 | ### Search 44 | 45 | [Guide: How to transition off the Search endpoint](https://help.openai.com/en/articles/6272952-search-transition-guide) 46 | 47 | * Option 1: transition to embeddings-based search **(recommended)** 48 | * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb) 49 | * Option 2: reimplement Search endpoint functionality 50 | * Example code: [search_functionality_example.py](search_functionality_example.py) 51 | -------------------------------------------------------------------------------- /transition_guides_for_deprecated_API_endpoints/answers_functionality_example.py: 
-------------------------------------------------------------------------------- 1 | from transformers import GPT2TokenizerFast 2 | 3 | import openai 4 | 5 | tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") 6 | 7 | MAX_TOKENS_LIMIT = 2048 8 | ANSWERS_INSTRUCTION = "Please answer the question according to the above context.\n" 9 | CONTEXT_TEMPLATE = "===\nContext: {context}\n===\n" 10 | 11 | 12 | def extract_instruction(instruction): 13 | """ 14 | Extract the `instruction` parameter and format it properly. 15 | If it does not exist, return an empty string. 16 | """ 17 | if instruction is None: 18 | return "" 19 | 20 | return f"{instruction.strip()}\n\n" 21 | 22 | 23 | def semantic_search( 24 | search_model, query_for_search, file_id=None, max_documents=None, examples=None 25 | ): 26 | """ 27 | :param examples: A list of {"text":...} or {"text": ..., "label": ...}. 28 | :return: 29 | a list of semantic search result dicts for documents, sorted by "score": 30 | [ 31 | { 32 | "document": ..., 33 | "object": "search_result", 34 | "score": ..., 35 | "text": ..., 36 | }, 37 | ... 38 | ] 39 | """ 40 | assert (examples is None) ^ (file_id is None) # xor 41 | 42 | if file_id is not None: 43 | # This is where you'd do an elastic search call. Since there isn't an example of this 44 | # we can query, we'll raise an error. 45 | # The return value from this would be a list of examples 46 | raise NotImplementedError() 47 | 48 | # This isn't quite accurate since Search is also being deprecated. See our search guide for more 49 | # information. 50 | 51 | search_result = openai.Search.create( 52 | model=search_model, 53 | documents=[x["text"] for x in examples], 54 | query=query_for_search, 55 | ) 56 | 57 | info_dict = {d["document"]: d for d in search_result["data"]} 58 | sorted_doc_ids = sorted( 59 | info_dict.keys(), key=lambda x: info_dict[x]["score"], reverse=True 60 | ) 61 | if max_documents: 62 | sorted_doc_ids = sorted_doc_ids[:max_documents] 63 | return [info_dict[i] for i in sorted_doc_ids] 64 | 65 | 66 | def select_by_length( 67 | sorted_doc_infos, 68 | max_token_len, 69 | lambda_fn=None, 70 | ): 71 | """ 72 | Given a list of (document ID, document content) pairs, select as many 73 | documents as possible as long as the total length does not go above `max_token_len`. 74 | 75 | :param sorted_doc_infos: A list of semantic search result dicts for documents, sorted by "score". 76 | :param max_token_len: The maximum token length for selected documents. 77 | :param lambda_fn: A function that takes in a search result dict and outputs a formatted 78 | example for context stuffing. 79 | :return: A tuple of ( 80 | A concatenation of selected documents used as context, 81 | A list of selected document IDs 82 | ) 83 | """ 84 | if not sorted_doc_infos: 85 | return "", [] 86 | 87 | selected_indices = [] 88 | total_doc_tokens = 0 89 | doc_dict = {} 90 | for i, doc_info in enumerate(sorted_doc_infos): 91 | doc = lambda_fn(doc_info) if lambda_fn else doc_info["text"] 92 | n_doc_tokens = len(tokenizer.encode(doc)) 93 | if total_doc_tokens + n_doc_tokens < max_token_len: 94 | total_doc_tokens += n_doc_tokens 95 | selected_indices.append(i) 96 | doc_dict[i] = doc 97 | 98 | # The top ranked documents should go at the end, closest to the question in the final prompt.
99 | selected_indices = selected_indices[::-1] 100 | 101 | context = "".join([doc_dict[i] for i in selected_indices]) 102 | selected_doc_infos = [sorted_doc_infos[i] for i in selected_indices] 103 | return context, selected_doc_infos 104 | 105 | 106 | def answers( 107 | examples, 108 | question, 109 | model, 110 | examples_context, 111 | file_id=None, 112 | documents=None, 113 | logit_bias=None, 114 | max_rerank=200, 115 | max_tokens=16, 116 | alternative_question=None, 117 | search_model="ada", 118 | temperature=0.0, 119 | logprobs=0, 120 | stop=None, 121 | n=1, 122 | ): 123 | """ 124 | Given a prompt, a question, a list of (question, answer) pairs as examples, and 125 | a list of documents for context, it tries to include all the QA examples and the top 126 | relevant context documents. 127 | 128 | The constructed prompt for the final completion call: 129 | ``` 130 | Please answer the question according to the above context. 131 | 132 | === 133 | Context: {{ the context for example QA pairs. }} 134 | === 135 | Q: example 1 question 136 | A: example 1 answer 137 | --- 138 | Q: example 2 question 139 | A: example 2 answer 140 | === 141 | Context: {{ a list of relevant documents sorted via search(question, documents) }} 142 | === 143 | Q: question 144 | A: 145 | ``` 146 | 147 | The returned object has a structure like: 148 | { 149 | "answers": [ 150 | "Beijing", 151 | "Beijing, China" 152 | ], 153 | "completion": "xxx-xxx", 154 | "object": "answer", 155 | "selected_documents": [ 156 | { 157 | "document": ..., # document index, same as in search/ results. 158 | "object": "search_result", 159 | "text": ..., 160 | }, 161 | ... 162 | ], 163 | } 164 | """ 165 | 166 | examples = examples if examples else [] 167 | 168 | example_prompts = [f"Q: {x}\nA: {y}" for x, y in examples] 169 | prompt = f"Q: {question}\nA:" 170 | 171 | # Append all the QA examples into the prompt. 172 | if examples_context: 173 | examples_context = CONTEXT_TEMPLATE.format(context=examples_context) 174 | instruction = ( 175 | ANSWERS_INSTRUCTION + examples_context + "\n---\n".join(example_prompts) + "\n" 176 | ) 177 | 178 | logit_bias = logit_bias if logit_bias is not None else {} 179 | 180 | if file_id is None and documents is None: 181 | raise Exception("Please submit at least one of `documents` or `file`.") 182 | if file_id is not None and documents is not None: 183 | raise Exception("Please submit only one of `documents` or `file`.") 184 | 185 | instruction = extract_instruction(instruction) 186 | 187 | n_instruction_tokens = len(tokenizer.encode(instruction)) 188 | n_prompt_tokens = len(tokenizer.encode(prompt)) 189 | n_query_tokens = len(tokenizer.encode(question)) 190 | n_context_tokens = len(tokenizer.encode(CONTEXT_TEMPLATE.format(context=""))) 191 | 192 | if documents is not None: 193 | documents = [doc.strip() + " " for doc in documents] 194 | n_docs_tokens = [len(tokenizer.encode(doc)) for doc in documents] 195 | 196 | # After accounting for all the required content, this is how many tokens are left for context stuffing.
197 | leftover_token_len = MAX_TOKENS_LIMIT - ( 198 | n_instruction_tokens + n_context_tokens + n_prompt_tokens + max_tokens 199 | ) 200 | sorted_doc_infos = [] 201 | 202 | question_for_search = ( 203 | alternative_question if alternative_question is not None else question 204 | ) 205 | if file_id is not None: 206 | sorted_doc_infos = semantic_search( 207 | search_model, 208 | question_for_search, 209 | file_id=file_id, 210 | max_documents=max_rerank, 211 | ) 212 | 213 | elif len(documents) == 0: 214 | # If no context document is provided, do nothing. 215 | pass 216 | 217 | elif min(n_docs_tokens) >= leftover_token_len: 218 | # If there is no room for adding any context doc. 219 | pass 220 | 221 | elif (max_rerank is None or max_rerank >= len(documents)) and sum( 222 | n_docs_tokens 223 | ) < leftover_token_len: 224 | # If the total length of the docs is short enough, add them all. 225 | selected_indices = list(range(len(documents))) 226 | 227 | sorted_doc_infos = [ 228 | {"document": i, "text": documents[i]} for i in selected_indices 229 | ] 230 | 231 | elif n_query_tokens + max(n_docs_tokens) >= MAX_TOKENS_LIMIT: 232 | # If the prompt and the longest document together go above the limit. 233 | total_tokens = n_query_tokens + max(n_docs_tokens) 234 | raise Exception( 235 | f"The longest document and prompt pair together contains {total_tokens} " 236 | f"tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. Please consider " 237 | f"shortening the prompt or the longest document." 238 | ) 239 | 240 | else: 241 | # If we can add some context documents but not all of them, we should 242 | # query search endpoint to rank docs by score. 243 | sorted_doc_infos = semantic_search( 244 | search_model, 245 | question_for_search, 246 | examples=[{"text": doc} for doc in documents], 247 | max_documents=max_rerank, 248 | ) 249 | 250 | # Select documents w.r.t. the context length limitation. 251 | context, sorted_doc_infos = select_by_length( 252 | sorted_doc_infos, 253 | leftover_token_len, 254 | lambda_fn=lambda x: x["text"].strip() + " ", 255 | ) 256 | 257 | # Add instruction before the context and the prompt after the context.
258 | if context: 259 | context = CONTEXT_TEMPLATE.format(context=context.strip()) 260 | full_prompt = instruction + context + prompt 261 | 262 | completion_result = openai.Completion.create( 263 | engine=model, 264 | prompt=full_prompt, 265 | logit_bias=logit_bias, 266 | temperature=temperature, 267 | n=n, 268 | max_tokens=max_tokens, 269 | stop=stop, 270 | logprobs=logprobs, 271 | ) 272 | 273 | completion_result["selected_documents"] = sorted_doc_infos 274 | 275 | result = dict( 276 | object="answer", 277 | selected_documents=completion_result.pop("selected_documents"), 278 | completion=completion_result["id"], 279 | ) 280 | 281 | result["answers"] = [ 282 | item["text"].replace("A:", "").split("Q:")[0].strip() 283 | for item in completion_result["choices"] 284 | ] 285 | 286 | return result 287 | 288 | 289 | print( 290 | answers( 291 | examples=[ 292 | ["What is the capital of Washington", "Olympia"], 293 | ["What is the capital of Oregon", "Salem"], 294 | ], 295 | question="What is the capital of China?", 296 | examples_context="I am a bot that names country capitals", 297 | documents=["I am a bot that names country capitals"], 298 | model="davinci", 299 | search_model="ada", 300 | alternative_question="different test", 301 | max_tokens=16, 302 | stop=["\n\n"], 303 | ) 304 | ) 305 | -------------------------------------------------------------------------------- /transition_guides_for_deprecated_API_endpoints/classification_functionality_example.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import defaultdict 3 | 4 | from transformers import GPT2TokenizerFast 5 | 6 | import openai 7 | 8 | tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") 9 | 10 | MAX_TOKENS_LIMIT = 2048 11 | 12 | 13 | def create_instruction(labels) -> str: 14 | """ 15 | Construct an instruction for a classification task. 16 | """ 17 | instruction = f"Please classify a piece of text into the following categories: {', '.join(labels)}." 18 | 19 | return f"{instruction.strip()}\n\n" 20 | 21 | 22 | def semantic_search( 23 | search_model, query_for_search, file_id=None, max_documents=None, examples=None 24 | ): 25 | """ 26 | :param examples: A list of {"text":...} or {"text": ..., "label": ...}. 27 | :return: 28 | a list of semantic search result dict of documents sorted by "score": 29 | [ 30 | { 31 | "document": ..., 32 | "object": "search_result", 33 | "score": ..., 34 | "text": ..., 35 | }, 36 | ... 37 | ] 38 | 39 | """ 40 | assert (examples is None) ^ (file_id is None) # xor 41 | 42 | if file_id is not None: 43 | # This is where you'd do an elastic search call. Since there isn't an example of this 44 | # we can query, we'll raise an error. 45 | # The return value from this would be a list of examples 46 | raise NotImplementedError() 47 | 48 | # This isn't quite accurate since Search is also being deprecated. See our search guide for more 49 | # information. 
50 | 51 | search_result = openai.Search.create( 52 | model=search_model, 53 | documents=[x["text"] for x in examples], 54 | query=query_for_search, 55 | ) 56 | 57 | info_dict = {d["document"]: d for d in search_result["data"]} 58 | sorted_doc_ids = sorted( 59 | info_dict.keys(), key=lambda x: info_dict[x]["score"], reverse=True 60 | ) 61 | if max_documents: 62 | sorted_doc_ids = sorted_doc_ids[:max_documents] 63 | return [info_dict[i] for i in sorted_doc_ids] 64 | 65 | 66 | def select_by_length( 67 | sorted_doc_infos, 68 | max_token_len, 69 | lambda_fn=None, 70 | ): 71 | """ 72 | Given a list of (document ID, document content) pairs, select as many 73 | documents as possible as long as the total length does not go above `max_token_len`. 74 | 75 | :param sorted_doc_infos: A list of semantic search result dicts for documents, sorted by "score". 76 | :param max_token_len: The maximum token length for selected documents. 77 | :param lambda_fn: A function that takes in a search result dict and outputs a formatted 78 | example for context stuffing. 79 | :return: A tuple of ( 80 | A concatenation of selected documents used as context, 81 | A list of selected document IDs 82 | ) 83 | """ 84 | if not sorted_doc_infos: 85 | return "", [] 86 | 87 | selected_indices = [] 88 | total_doc_tokens = 0 89 | doc_dict = {} 90 | for i, doc_info in enumerate(sorted_doc_infos): 91 | doc = lambda_fn(doc_info) if lambda_fn else doc_info["text"] 92 | n_doc_tokens = len(tokenizer.encode(doc)) 93 | if total_doc_tokens + n_doc_tokens < max_token_len: 94 | total_doc_tokens += n_doc_tokens 95 | selected_indices.append(i) 96 | doc_dict[i] = doc 97 | 98 | # The top ranked documents should go at the end, closest to the question in the final prompt. 99 | selected_indices = selected_indices[::-1] 100 | 101 | context = "".join([doc_dict[i] for i in selected_indices]) 102 | selected_doc_infos = [sorted_doc_infos[i] for i in selected_indices] 103 | return context, selected_doc_infos 104 | 105 | 106 | def format_example_fn(x: dict) -> str: 107 | return "Text: {text}\nCategory: {label}\n---\n".format( 108 | text=x["text"].replace("\n", " ").strip(), 109 | label=x["label"].replace("\n", " ").strip(), 110 | ) 111 | 112 | 113 | def classifications( 114 | query, 115 | model, 116 | search_model="ada", 117 | examples=None, 118 | file=None, 119 | labels=None, 120 | temperature=0.0, 121 | logprobs=None, 122 | max_examples=200, 123 | logit_bias=None, 124 | alternative_query=None, 125 | max_tokens=16, 126 | ) -> dict: 127 | """ 128 | Given a query and a list of (text, label) example pairs, 129 | it selects the top relevant examples to construct a prompt for few-shot classification. 130 | 131 | The constructed prompt for the final completion call: 132 | ``` 133 | {{ an optional instruction }} 134 | 135 | Text: example 1 text 136 | Category: example 1 label 137 | --- 138 | Text: example 2 text 139 | Category: example 2 label 140 | --- 141 | Text: question 142 | Category: 143 | ``` 144 | 145 | The returned object has a structure like: 146 | { 147 | "label": "Happy", 148 | "model": "ada", 149 | "object": "classification", 150 | "selected_examples": [ 151 | { 152 | "document": ..., # document index, same as in search/ results. 153 | "text": ..., 154 | "label": ..., 155 | }, 156 | ...
157 | ], 158 | } 159 | """ 160 | 161 | query = query.replace("\n", " ").strip() 162 | logit_bias = logit_bias if logit_bias else {} 163 | labels = labels if labels else [] 164 | 165 | if file is None and examples is None: 166 | raise Exception("Please submit at least one of `examples` or `file`.") 167 | if file is not None and examples is not None: 168 | raise Exception("Please submit only one of `examples` or `file`.") 169 | 170 | instruction = create_instruction(labels) 171 | 172 | query_for_search = alternative_query if alternative_query is not None else query 173 | 174 | # Extract examples and example labels first. 175 | if file is not None: 176 | sorted_doc_infos = semantic_search( 177 | search_model, 178 | query_for_search, 179 | file_id=file, 180 | max_documents=max_examples, 181 | ) 182 | 183 | else: 184 | example_prompts = [ 185 | format_example_fn(dict(text=x, label=y)) for x, y in examples 186 | ] 187 | n_examples_tokens = [len(tokenizer.encode(x)) for x in example_prompts] 188 | 189 | query_prompt = f"Text: {query}\nCategory:" 190 | n_instruction_tokens = len(tokenizer.encode(instruction)) 191 | n_query_tokens = len(tokenizer.encode(query_prompt)) 192 | 193 | # After accounting for all the required content, this is how many tokens are left for context stuffing. 194 | leftover_token_len = MAX_TOKENS_LIMIT - ( 195 | n_instruction_tokens + n_query_tokens + max_tokens 196 | ) 197 | 198 | # Process when `examples` are provided but no `file` is provided. 199 | if examples: 200 | if (max_examples is None or max_examples >= len(examples)) and sum( 201 | n_examples_tokens 202 | ) < leftover_token_len: 203 | # If the total length of docs is short enough that we can add all examples, no search call. 204 | selected_indices = list(range(len(examples))) 205 | 206 | sorted_doc_infos = [ 207 | {"document": i, "text": examples[i][0], "label": examples[i][1]} 208 | for i in selected_indices 209 | ] 210 | 211 | elif max(n_examples_tokens) + n_query_tokens >= MAX_TOKENS_LIMIT: 212 | # If the prompt and the longest example together go above the limit: 213 | total_tokens = max(n_examples_tokens) + n_query_tokens 214 | raise Exception( 215 | f"The longest classification example, query and prompt together contain " 216 | f"{total_tokens} tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. " 217 | f"Please consider shortening your instruction, query or the longest example." 218 | ) 219 | 220 | else: 221 | # If we can add some context documents but not all of them, we should 222 | # query search endpoint to rank docs by score. 223 | sorted_doc_infos = semantic_search( 224 | search_model, 225 | query_for_search, 226 | examples=[{"text": x, "label": y} for x, y in examples], 227 | max_documents=max_examples, 228 | ) 229 | 230 | # Per label, we have a list of doc ids sorted by their relevancy to the query. 231 | label_to_indices = defaultdict(list) 232 | for idx, d in enumerate(sorted_doc_infos): 233 | label_to_indices[d["label"]].append(idx) 234 | 235 | # Do a round robin for each of the different labels, taking the best match for each label.
236 | label_indices = [label_to_indices[label] for label in labels] 237 | mixed_indices = [ 238 | i for x in itertools.zip_longest(*label_indices) for i in x if i is not None 239 | ] 240 | sorted_doc_infos = [sorted_doc_infos[i] for i in mixed_indices] 241 | 242 | # Try to select as many examples as needed to fit into the context 243 | context, sorted_doc_infos = select_by_length( 244 | sorted_doc_infos, 245 | leftover_token_len, 246 | lambda_fn=format_example_fn, 247 | ) 248 | 249 | prompt = instruction + context + query_prompt 250 | 251 | completion_params = { 252 | "engine": model, 253 | "prompt": prompt, 254 | "temperature": temperature, 255 | "logprobs": logprobs, 256 | "logit_bias": logit_bias, 257 | "max_tokens": max_tokens, 258 | "stop": "\n", 259 | "n": 1, 260 | } 261 | 262 | completion_resp = openai.Completion.create( 263 | **completion_params, 264 | ) 265 | 266 | label = completion_resp["choices"][0]["text"] 267 | label = label.split("\n")[0].strip().lower().capitalize() 268 | if label not in labels: 269 | label = "Unknown" 270 | 271 | result = dict( 272 | # TODO: Add id for object persistence. 273 | object="classification", 274 | model=completion_resp["model"], 275 | label=label, 276 | completion=completion_resp["id"], 277 | ) 278 | 279 | result["selected_examples"] = sorted_doc_infos 280 | 281 | return result 282 | 283 | 284 | print( 285 | classifications( 286 | query="this is my test", 287 | model="davinci", 288 | search_model="ada", 289 | examples=[ 290 | ["this is my test", "davinci"], 291 | ["this is other test", "blahblah"], 292 | ], 293 | file=None, 294 | labels=["davinci", "blahblah"], 295 | temperature=0.1, 296 | logprobs=0, 297 | max_examples=200, 298 | logit_bias=None, 299 | alternative_query="different test", 300 | max_tokens=16, 301 | ) 302 | ) 303 | -------------------------------------------------------------------------------- /transition_guides_for_deprecated_API_endpoints/search_functionality_example.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2TokenizerFast 2 | 3 | import openai 4 | 5 | tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") 6 | 7 | docs = ["test1", "asdklgjnasdv", "banana", "lord lollipop"] 8 | query = "apple orang asdansbdausd" 9 | 10 | print(openai.Search.create(model="davinci", query=query, documents=docs)) 11 | 12 | 13 | def construct_context(query, document): 14 | return "<|endoftext|>{document}\n\n---\n\nThe above passage is related to: {query}".format( 15 | document=document, query=query 16 | ) 17 | 18 | 19 | def get_score(context, query, log_probs, text_offsets) -> float: 20 | SCORE_MULTIPLIER = 100.0 21 | 22 | log_prob = 0 23 | count = 0 24 | cutoff = len(context) - len(query) 25 | 26 | for i in range(len(text_offsets) - 1, 0, -1): 27 | log_prob += log_probs[i] 28 | count += 1 29 | 30 | if text_offsets[i] <= cutoff and text_offsets[i] != text_offsets[i - 1]: 31 | break 32 | 33 | return log_prob / float(count) * SCORE_MULTIPLIER 34 | 35 | 36 | def search(query, documents, engine): 37 | 38 | prompts = [construct_context(query, doc) for doc in [""] + documents] 39 | 40 | resps = openai.Completion.create( 41 | model=engine, 42 | prompt=prompts, 43 | temperature=1.0, 44 | top_p=1.0, 45 | max_tokens=0, 46 | logprobs=0, 47 | n=1, 48 | echo=True, 49 | ) 50 | 51 | resps_by_index = {choice["index"]: choice for choice in resps["choices"]} 52 | 53 | scores = [ 54 | get_score( 55 | prompts[i], 56 | query, 57 | resps_by_index[i]["logprobs"]["token_logprobs"], 58 | 
resps_by_index[i]["logprobs"]["text_offset"], 59 | ) 60 | for i in range(len(prompts)) 61 | ] 62 | 63 | # Normalize by subtracting the empty-document baseline score (the first prompt), then drop the baseline entry 64 | scores = [score - scores[0] for score in scores][1:] 65 | 66 | return [ 67 | { 68 | "object": "search_result", 69 | "document": document_idx, 70 | "score": round(score, 3), 71 | } 72 | for document_idx, score in enumerate(scores) 73 | ] 74 | 75 | 76 | print(search(query=query, documents=docs, engine="davinci")) 77 | --------------------------------------------------------------------------------