├── .gitignore
├── README.md
├── YOU_and_Together_Demo.ipynb
├── blogpost_final.ipynb
├── chatbot
│   ├── chatbot.py
│   └── requirements.txt
├── command_example.ipynb
├── data_enrichment.ipynb
├── langchain_retriever_evals.ipynb
├── legal_assistant_csv_langchain_chatbot.ipynb
├── pdf_langchain_chatbot.ipynb
├── transcripts_to_salesforce.ipynb
├── you_dspy_vc_chat.ipynb
├── you_news_and_llama_index.ipynb
└── you_pinecone_multiretriever.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .env
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # You-API-Examples
2 |
3 | This repository contains examples of how to use our Search API in conjunction with an LLM to perform retrieval-augmented generation (RAG) tasks.
4 |
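5 | A minimal sketch of the core pattern these examples build on: fetch web
6 | snippets from the Search API, then pass them to an LLM as context. The helper
7 | below is taken from the notebooks; it assumes your API key is in the
8 | `YDC_API_KEY` environment variable (the notebooks variously read `AUTH_KEY`,
9 | `YOU_API_KEY`, or `YDC_API_KEY`).
10 | 
11 | ```python
12 | import os
13 | 
14 | import requests
15 | 
16 | 
17 | def get_ai_snippets_for_query(query):
18 |     headers = {"X-API-Key": os.environ["YDC_API_KEY"]}
19 |     results = requests.get(
20 |         f"https://api.ydc-index.io/search?query={query}",
21 |         headers=headers,
22 |     ).json()
23 |     # Each search hit carries several snippets, so flatten both levels
24 |     return "\n".join("\n".join(hit["snippets"]) for hit in results["hits"])
25 | ```
26 | 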
--------------------------------------------------------------------------------
/YOU_and_Together_Demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "2pp4KYyMIaIb"
7 | },
8 | "source": [
9 | "# Using Together Inference REST API\n",
10 | "\n",
11 | "This is the simplest way to use Together Inference using REST API. You only need to provide your Together API key and YOU API key."
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {
17 | "id": "gjDNSnqkNUNh"
18 | },
19 | "source": [
20 | "### Provide the API keys and define functions\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 16,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "! pip -q install requests"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 17,
35 | "metadata": {
36 | "id": "K0G94X7L6KoH"
37 | },
38 | "outputs": [],
39 | "source": [
40 | "YOU_API_KEY = \"\" # Provide your YOU API key.\n",
41 | "TOGETHER_API_KEY = \"\" # Provide your TOGETHER API key.\n",
42 | "\n",
43 | "import requests\n",
44 | "\n",
45 | "def get_ai_snippets_for_query(query):\n",
46 | " headers = {\"X-API-Key\": YOU_API_KEY}\n",
47 | " results = requests.get(\n",
48 | " f\"https://api.ydc-index.io/search?query={query}\",\n",
49 | " headers=headers,\n",
50 | " ).json()\n",
51 | "\n",
52 | " # We return many text snippets for each search hit so\n",
53 | " # we need to explode both levels\n",
54 | " return \"\\n\".join([\"\\n\".join(hit[\"snippets\"]) for hit in results[\"hits\"]])\n",
55 | "\n",
56 | "def get_together_prompt(query, context):\n",
57 | " return f\"\"\"Provide an answer based on the given context.\\n\n",
58 | " Context: {context}\\n\n",
59 | " Question: {query}\"\"\"\n",
60 | "\n",
61 | "def ask_together(query, context, model_api_string):\n",
62 | " \"\"\"\n",
63 | " Generate a response from the given query and context.\n",
64 | "\n",
65 | " Args:\n",
66 | " query: (str) your query.\n",
67 | " context: (str) your context from snippets.\n",
68 | " model_api_string: (str) a model API string from Together Inference. See the full list in (https://docs.together.ai/docs/inference-models)\n",
69 | " \"\"\"\n",
70 | " # This is hard coded here. To automatically find the default values, use the\n",
71 | " # Python library as shown in the next section.\n",
72 | " prompt_format = \"[INST]\\n {prompt} \\n[/INST]\\n\\n\"\n",
73 | " stop_sequences = ['[INST]', '\\n\\n']\n",
74 | " max_context_length = 4096\n",
75 | "\n",
76 | " # Truncate the context based on the model context length. Instead of using its\n",
77 | " # tokenizer and the exact token count, we assume 1 token ~= ¾ words.\n",
78 | " truncated_context = \" \".join(context.split(\" \")[:int(max_context_length*3/4)])\n",
79 | " prompt = get_together_prompt(query, truncated_context)\n",
80 | "\n",
81 | " # Formatting the prompt properly through the model info.\n",
82 | " if prompt_format:\n",
83 | " prompt_format_list = prompt_format.split(\" \")\n",
84 | " formated_prompt = f\"{prompt_format_list[0]}{prompt}{prompt_format_list[2]}\"\n",
85 | " else:\n",
86 | " formated_prompt = prompt\n",
87 | "\n",
88 | "\n",
89 | " url = \"https://api.together.xyz/inference\"\n",
90 | "\n",
91 | " payload = {\n",
92 | " \"model\": model_api_string,\n",
93 | " \"prompt\": formated_prompt, # Use the corrrect prompt format.\n",
94 | " \"max_tokens\": 256,\n",
95 | " \"stop\":stop_sequences,\n",
96 | " \"temperature\": 1.0,\n",
97 | " \"top_p\": 0.7,\n",
98 | " \"top_k\": 50,\n",
99 | " \"repetition_penalty\": 1.1\n",
100 | " }\n",
101 | " headers = {\n",
102 | " \"accept\": \"application/json\",\n",
103 | " \"content-type\": \"application/json\",\n",
104 | " \"Authorization\": f\"Bearer {TOGETHER_API_KEY}\"\n",
105 | " }\n",
106 | "\n",
107 | " response = requests.post(url, json=payload, headers=headers)\n",
108 | "\n",
109 | " if response.status_code != 200:\n",
110 | " raise ValueError(f\"Request failed with status code {response.status_code}: {response.text}\")\n",
111 | "\n",
112 | " return response.json()['output']['choices'][0]['text']\n",
113 | "\n",
114 | "def ask_together_with_ai_snippets(query, model_api_string):\n",
115 | " ai_snippets = get_ai_snippets_for_query(query)\n",
116 | " return ask_together(query, ai_snippets, model_api_string)"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {
122 | "id": "R3rrMAD3NYoQ"
123 | },
124 | "source": [
125 | "### Send the query with a choice of your generation model from Together Inference API\n",
126 | "\n"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 18,
132 | "metadata": {
133 | "colab": {
134 | "base_uri": "https://localhost:8080/"
135 | },
136 | "id": "P1biFE3r871m",
137 | "outputId": "395d5617-3ee4-47ca-ee58-ab02ab0a24db"
138 | },
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "1. Linux\n",
145 | "2. Git\n",
146 | "3. Node.js\n",
147 | "4. Android\n",
148 | "5. Ruby on Rails\n",
149 | "6. Python\n",
150 | "7. MariaDB\n",
151 | "8. PostgreSQL\n",
152 | "9. Docker\n",
153 | "10. Visual Studio Code (Documentation)\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "query = \"What are top 10 successful open source projects?\"\n",
159 | "model_api_string=\"togethercomputer/Llama-2-7B-32K-Instruct\"\n",
160 | "print(ask_together_with_ai_snippets(query, model_api_string))"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {
166 | "id": "AfGpkpJ8DCkz"
167 | },
168 | "source": [
169 | "# Using Together Inference Python Library\n",
170 | "\n",
171 | "This is more useful when you want to find the default prompt format and stop sequences. You need to install the together library."
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {
177 | "id": "L_f4tGg7Ng-6"
178 | },
179 | "source": [
180 | "### Provide the API keys and define functions"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 19,
186 | "metadata": {
187 | "id": "mVqW5EeG9Uqb"
188 | },
189 | "outputs": [],
190 | "source": [
191 | "! pip install -q together"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 20,
197 | "metadata": {
198 | "id": "O49UXxdN9YAw"
199 | },
200 | "outputs": [],
201 | "source": [
202 | "import together\n",
203 | "import requests\n",
204 | "\n",
205 | "YOU_API_KEY = \"\" # Provide your YOU API key.\n",
206 | "TOGETHER_API_KEY = \"\" # Provide your TOGETHER API key.\n",
207 | "\n",
208 | "together.api_key = TOGETHER_API_KEY\n",
209 | "\n",
210 | "def get_ai_snippets_for_query(query):\n",
211 | " headers = {\"X-API-Key\": YOU_API_KEY}\n",
212 | " results = requests.get(\n",
213 | " f\"https://api.ydc-index.io/search?query={query}\",\n",
214 | " headers=headers,\n",
215 | " ).json()\n",
216 | "\n",
217 | " # We return many text snippets for each search hit so\n",
218 | " # we need to explode both levels\n",
219 | " return \"\\n\".join([\"\\n\".join(hit[\"snippets\"]) for hit in results[\"hits\"]])\n",
220 | "\n",
221 | "def get_together_prompt(query, context):\n",
222 | " return f\"\"\"You are an helpful assistant answering to a question based on\n",
223 | " provided context. Here is a context found on the internet: {context}.\\n\n",
224 | " Answer the following question: {query}\\n\"\"\"\n",
225 | "\n",
226 | "def get_model_config(model_api_string):\n",
227 | " model_list = together.Models.list()\n",
228 | "\n",
229 | " prompt_format, stop_sequences = None, []\n",
230 | " context_length = 2048\n",
231 | " for m in model_list:\n",
232 | " if m['name'] == model_api_string:\n",
233 | " if 'prompt_format' in m['config']: prompt_format = m['config']['prompt_format']\n",
234 | " if 'stop' in m['config']: stop_sequences = m['config']['stop']\n",
235 | " if 'context_length' in m: context_length = m['context_length']\n",
236 | " break\n",
237 | "\n",
238 | " return prompt_format, stop_sequences, context_length\n",
239 | "\n",
240 | "def ask_together(query, context, model_api_string):\n",
241 | " \"\"\"\n",
242 | " Generate a response from the given query and context.\n",
243 | "\n",
244 | " Args:\n",
245 | " query: (str) your query.\n",
246 | " context: (str) your context from snippets.\n",
247 | " model_api_string: (str) a model API string from Together Inference. See the full list in (https://docs.together.ai/docs/inference-models)\n",
248 | " \"\"\"\n",
249 | " prompt_format, stop_sequences, max_context_length = get_model_config(model_api_string)\n",
250 | "\n",
251 | " # Truncate the context based on the model context length. Instead of using its\n",
252 | " # tokenizer and the exact token count, we assume 1 token ~= ¾ words.\n",
253 | " truncated_context = \" \".join(context.split(\" \")[:int(max_context_length*3/4)])\n",
254 | " prompt = get_together_prompt(query, truncated_context)\n",
255 | "\n",
256 | " # Formatting the prompt properly through the model info.\n",
257 | " if prompt_format:\n",
258 | " prompt_format_list = prompt_format.split(\" \")\n",
259 | " formated_prompt = f\"{prompt_format_list[0]}{prompt}{prompt_format_list[2]}\"\n",
260 | " else:\n",
261 | " formated_prompt = prompt\n",
262 | "\n",
263 | " response = together.Complete.create(\n",
264 | " prompt=formated_prompt,\n",
265 | " model=model_api_string,\n",
266 | " max_tokens = 256,\n",
267 | " temperature = 1.0,\n",
268 | " top_k = 60,\n",
269 | " top_p = 0.6,\n",
270 | " repetition_penalty = 1.1,\n",
271 | " stop = stop_sequences,\n",
272 | " )\n",
273 | "\n",
274 | " return response[\"output\"][\"choices\"][0][\"text\"]\n",
275 | "\n",
276 | "def ask_together_with_ai_snippets(query, model_api_string):\n",
277 | " ai_snippets = get_ai_snippets_for_query(query)\n",
278 | " return ask_together(query, ai_snippets, model_api_string)\n"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {
284 | "id": "VxIZB2_MNl3Z"
285 | },
286 | "source": [
287 | "### Send the query with a choice of your generation model from Together Inference API"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 21,
293 | "metadata": {
294 | "colab": {
295 | "base_uri": "https://localhost:8080/"
296 | },
297 | "id": "aLcPpwmr9vgz",
298 | "outputId": "ef287434-dbce-444c-b192-2af3612a3f92"
299 | },
300 | "outputs": [
301 | {
302 | "ename": "AttributeError",
303 | "evalue": "module 'together' has no attribute 'Models'",
304 | "output_type": "error",
305 | "traceback": [
306 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
307 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
308 | "Cell \u001b[0;32mIn[21], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWhat are top 10 successful open source projects?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2\u001b[0m model_api_string\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtogethercomputer/Llama-2-7B-32K-Instruct\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mask_together_with_ai_snippets\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel_api_string\u001b[49m\u001b[43m)\u001b[49m)\n",
309 | "Cell \u001b[0;32mIn[20], line 77\u001b[0m, in \u001b[0;36mask_together_with_ai_snippets\u001b[0;34m(query, model_api_string)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mask_together_with_ai_snippets\u001b[39m(query, model_api_string):\n\u001b[1;32m 76\u001b[0m ai_snippets \u001b[38;5;241m=\u001b[39m get_ai_snippets_for_query(query)\n\u001b[0;32m---> 77\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mask_together\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mai_snippets\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel_api_string\u001b[49m\u001b[43m)\u001b[49m\n",
310 | "Cell \u001b[0;32mIn[20], line 48\u001b[0m, in \u001b[0;36mask_together\u001b[0;34m(query, context, model_api_string)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mask_together\u001b[39m(query, context, model_api_string):\n\u001b[1;32m 40\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;124;03m Generate a response from the given query and context.\u001b[39;00m\n\u001b[1;32m 42\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;124;03m model_api_string: (str) a model API string from Together Inference. See the full list in (https://docs.together.ai/docs/inference-models)\u001b[39;00m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 48\u001b[0m prompt_format, stop_sequences, max_context_length \u001b[38;5;241m=\u001b[39m \u001b[43mget_model_config\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_api_string\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;66;03m# Truncate the context based on the model context length. Instead of using its\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m# tokenizer and the exact token count, we assume 1 token ~= ¾ words.\u001b[39;00m\n\u001b[1;32m 52\u001b[0m truncated_context \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(context\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m)[:\u001b[38;5;28mint\u001b[39m(max_context_length\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m3\u001b[39m\u001b[38;5;241m/\u001b[39m\u001b[38;5;241m4\u001b[39m)])\n",
311 | "Cell \u001b[0;32mIn[20], line 26\u001b[0m, in \u001b[0;36mget_model_config\u001b[0;34m(model_api_string)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_model_config\u001b[39m(model_api_string):\n\u001b[0;32m---> 26\u001b[0m model_list \u001b[38;5;241m=\u001b[39m \u001b[43mtogether\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mModels\u001b[49m\u001b[38;5;241m.\u001b[39mlist()\n\u001b[1;32m 28\u001b[0m prompt_format, stop_sequences \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m, []\n\u001b[1;32m 29\u001b[0m context_length \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2048\u001b[39m\n",
312 | "\u001b[0;31mAttributeError\u001b[0m: module 'together' has no attribute 'Models'"
313 | ]
314 | }
315 | ],
316 | "source": [
317 | "query = \"What are top 10 successful open source projects?\"\n",
318 | "model_api_string=\"togethercomputer/Llama-2-7B-32K-Instruct\"\n",
319 | "print(ask_together_with_ai_snippets(query, model_api_string))"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": []
328 | }
329 | ],
330 | "metadata": {
331 | "colab": {
332 | "provenance": []
333 | },
334 | "kernelspec": {
335 | "display_name": "Python 3",
336 | "name": "python3"
337 | },
338 | "language_info": {
339 | "codemirror_mode": {
340 | "name": "ipython",
341 | "version": 3
342 | },
343 | "file_extension": ".py",
344 | "mimetype": "text/x-python",
345 | "name": "python",
346 | "nbconvert_exporter": "python",
347 | "pygments_lexer": "ipython3",
348 | "version": "3.10.14"
349 | }
350 | },
351 | "nbformat": 4,
352 | "nbformat_minor": 0
353 | }
354 |
--------------------------------------------------------------------------------
/chatbot/chatbot.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | 
3 | import requests
4 | import sseclient  # sseclient-py package; used for streaming responses
5 | 
6 | import streamlit as st
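7 | 
8 | # Run with `streamlit run chatbot.py`; requires YDC_API_KEY in .streamlit/secrets.toml.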
9 |
10 | SYSTEM_PROMPT = "You are a helpful, smart assistant. Search the web and answer the user's question accurately, with citations. You have access to the web to answer any question."
11 |
12 |
13 | def build_prompt():
14 | prompt = ""
15 | for msg in st.session_state.messages:
16 | prompt += msg["role"] + ":\t" + msg["content"] + "\n"
17 | return prompt
18 |
19 | ydc_api_key = st.secrets["YDC_API_KEY"]
20 |
21 |
22 | def get_ydc_answer(mode='smart'):
23 |     """Fetch a complete (non-streaming) answer from the You.com chat API."""
24 |     query = build_prompt()
25 |     headers = {'x-api-key': ydc_api_key, 'Content-Type': 'application/json'}
26 |     endpoint = f"https://chat-api.you.com/{mode}"  # use /research for Research mode
27 |     params = {"query": query, "chat_id": st.session_state.chat_id}
28 |     response = requests.post(endpoint, json=params, headers=headers)
29 |     return response.json()
30 |
31 | def get_ydc_stream_answer(mode='smart'):
32 |     """Stream the answer as server-sent events, yielding one token at a time.
33 | 
34 |     Assumes the sseclient-py package (imported above as sseclient); the
35 |     You.com API emits "token" events while the answer is being generated.
36 |     Note: unlike get_ydc_answer, this variant does not pass a chat_id.
37 |     """
38 |     query = build_prompt()
39 |     headers = {'x-api-key': ydc_api_key}
40 |     endpoint = f"https://chat-api.you.com/{mode}"  # use /research for Research mode
41 |     params = {"query": query, "stream": True}
42 |     response = requests.get(endpoint, headers=headers, params=params, stream=True)
43 |     client = sseclient.SSEClient(response)
44 |     for event in client.events():
45 |         if event.event == "token":
46 |             yield str(event.data)
47 | 
48 |
49 | # Reset the chat history and start a new conversation
50 | def clear_chat_history():
51 | st.session_state.chat_id = str(uuid.uuid4())
52 | st.session_state["messages"] = [
53 | {"role": "system", "content": SYSTEM_PROMPT},
54 | {"role": "assistant", "content": "What can I help you build today?"}
55 | ]
56 |
57 |
58 | with st.sidebar:
59 |     model_select = st.selectbox("Select a mode", ["smart", "research"])
60 | st.button('Reset Chat', on_click=clear_chat_history)
61 |
62 |
63 |
64 | st.title("💬 YOU.COM API ASSISTANT")
65 | st.caption(""" 🚀 Let us help you build with You.com""")
66 |
67 |
68 | if "messages" not in st.session_state:
69 | st.session_state.chat_id = str(uuid.uuid4())
70 | st.session_state["messages"] = [
71 | {"role": "system", "content": SYSTEM_PROMPT},
72 | {"role": "assistant", "content": "What can I help you build today?"}
73 | ]
74 |
75 | # Display or clear messages
76 | for msg in st.session_state.messages:
77 | if msg["role"] != "system":
78 | st.chat_message(msg["role"]).write(msg["content"])
79 |
80 | # User provided prompt
81 | if prompt := st.chat_input():
82 | st.session_state.messages.append({"role": "user", "content": prompt})
83 | st.chat_message("user").write(prompt)
84 |
85 |
86 | # Generate a response if the last message is not from the assistant
87 | if st.session_state.messages[-1]["role"] != "assistant":
88 | with st.chat_message("assistant"):
89 |         # Non-streaming request
90 |         full_response = get_ydc_answer(model_select)["answer"]
91 |         st.write(full_response)
92 | 
93 |         # Streaming variant (uncomment to stream tokens as they arrive):
94 |         # full_response = get_ydc_stream_answer(model_select)
95 |         # st.write_stream(full_response)
96 |
97 | message = {"role": "assistant", "content": full_response}
98 | st.session_state.messages.append(message)
--------------------------------------------------------------------------------
/chatbot/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | requests
3 | sseclient-py
--------------------------------------------------------------------------------
/command_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "51c76d98",
6 | "metadata": {},
7 | "source": [
8 | "## Welcome to You.com's Search API\n",
9 | "### Access Keys\n",
10 | "You will need to have the environment variables `AUTH_KEY` and `COHERE_API_KEY` set for this notebook to fully work."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "id": "86b35f4a",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "!pip install cohere"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "id": "c0007036",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import os\n",
31 | "\n",
32 | "\n",
33 | "if not os.getenv(\"AUTH_KEY\") or not os.getenv(\"COHERE_API_KEY\"):\n",
34 | " raise RuntimeError(\"You need to set both AUTH_KEY and COHERE_API_KEY environment variables to proceed\")\n",
35 | "\n",
36 | " \n",
37 | "try:\n",
38 | " import cohere\n",
39 | "except:\n",
40 | " raise RuntimeError(\"Cohere library is not installed\")\n",
41 | "\n",
42 | " \n",
43 | "cohere_client = cohere.Client(os.getenv(\"COHERE_API_KEY\"))"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "67d0d549",
49 | "metadata": {},
50 | "source": [
51 | "### Define Utility Functions\n",
52 | "We use the requests library to interface with the You.com API currently with plans to have a dedicated Python package in the near future. We use the Cohere Python client to interface with the Command model."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "id": "96a6b0a9",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "import requests\n",
63 | "\n",
64 | "from itertools import chain\n",
65 | "\n",
66 | "\n",
67 | "def get_ai_snippets_for_query(query):\n",
68 | " headers = {\"X-API-Key\": os.environ[\"AUTH_KEY\"]}\n",
69 | " results = requests.get(\n",
70 | " f\"https://api.ydc-index.io/search?query={query}\",\n",
71 | " headers=headers,\n",
72 | " ).json()\n",
73 | " \n",
74 | " # We return many text snippets for each search hit so we need to explode both levels\n",
75 | " return \"\\n\".join([\"\\n\".join(hit[\"snippets\"]) for hit in results[\"hits\"]])\n",
76 | "\n",
77 | "\n",
78 | "def get_cohere_prompt(query, context):\n",
79 | " return f\"\"\"given a question and a bunch of snippets context try to answer the question using the context. If you can't please say 'Sorry hooman, no dice'.\n",
80 | "question: {query}\n",
81 | "context: {context}\n",
82 | "answer: \"\"\"\n",
83 | "\n",
84 | "\n",
85 | "def ask_cohere(query, context):\n",
86 | " try:\n",
87 | " return cohere_client.generate(prompt=get_cohere_prompt(query, context))[\n",
88 | " 0\n",
89 | " ].text\n",
90 | " except:\n",
91 | " logging.error(\n",
92 | " f\"Cohere call failed for query {query} and context {context}\",\n",
93 | " exc_info=True,\n",
94 | " )\n",
95 | " return \"Sorry hooman, no dice\"\n",
96 | "\n",
97 | "\n",
98 | "def ask_cohere_with_ai_snippets(query):\n",
99 | " ai_snippets = get_ai_snippets_for_query(query)\n",
100 | " return ask_cohere(query, ai_snippets)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "id": "f8208f10",
106 | "metadata": {},
107 | "source": [
108 | "### Now ask a question!"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "id": "d1d4cceb",
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "ask_cohere_with_ai_snippets(\"Who is the CEO of Cohere?\")"
119 | ]
120 | }
121 | ],
122 | "metadata": {
123 | "kernelspec": {
124 | "display_name": "Python 3 (ipykernel)",
125 | "language": "python",
126 | "name": "python3"
127 | },
128 | "language_info": {
129 | "codemirror_mode": {
130 | "name": "ipython",
131 | "version": 3
132 | },
133 | "file_extension": ".py",
134 | "mimetype": "text/x-python",
135 | "name": "python",
136 | "nbconvert_exporter": "python",
137 | "pygments_lexer": "ipython3",
138 | "version": "3.9.17"
139 | }
140 | },
141 | "nbformat": 4,
142 | "nbformat_minor": 5
143 | }
144 |
--------------------------------------------------------------------------------
/data_enrichment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Missing Tabular Data Enrichment With YOU Research API \n",
8 | "\n",
9 | "In this tutorial, we will enrich the missing values in a tabular dataset primarily using `You.com` research API. "
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "### Install required packages"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "%%capture\n",
26 | "! pip install pandas\n",
27 | "! pip install openai\n",
28 | "! pip install openai\n",
29 | "! pip install python-dotenv\n",
30 | "! pip install requests"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "### Loading API keys\n",
38 | "\n",
39 | "Set the environment variables `YDC_API_KEY` and `OPENAI_API_KEY` in a .env file which we will load using `dotenv`."
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "True"
51 | ]
52 | },
53 | "execution_count": 2,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "import dotenv\n",
60 | "dotenv.load_dotenv(\".env\", override=True)"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "### Creating a synthetic dataset for enrichment\n",
68 | "\n",
69 | "We will create a dataframe with a few company domains and columns such as number of employees, NAICS code, headquarter and year founded with missing values. Our aim is to fill these missing values using `You.com` research API. "
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 3,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "data": {
79 | "text/html": [
80 | "
\n",
81 | "\n",
94 | "
\n",
95 | " \n",
96 | " \n",
97 | " | \n",
98 | " company | \n",
99 | " number of employees | \n",
100 | " NAICS code(s) | \n",
101 | " headquarter location | \n",
102 | " founded year | \n",
103 | "
\n",
104 | " \n",
105 | " \n",
106 | " \n",
107 | " 0 | \n",
108 | " Apple | \n",
109 | " | \n",
110 | " | \n",
111 | " | \n",
112 | " | \n",
113 | "
\n",
114 | " \n",
115 | " 1 | \n",
116 | " Canadian Tire | \n",
117 | " | \n",
118 | " | \n",
119 | " | \n",
120 | " | \n",
121 | "
\n",
122 | " \n",
123 | " 2 | \n",
124 | " Home Depot | \n",
125 | " | \n",
126 | " | \n",
127 | " | \n",
128 | " | \n",
129 | "
\n",
130 | " \n",
131 | " 3 | \n",
132 | " LinkedIn Corporation | \n",
133 | " | \n",
134 | " | \n",
135 | " | \n",
136 | " | \n",
137 | "
\n",
138 | " \n",
139 | " 4 | \n",
140 | " General Dynamics | \n",
141 | " | \n",
142 | " | \n",
143 | " | \n",
144 | " | \n",
145 | "
\n",
146 | " \n",
147 | "
\n",
148 | "
"
149 | ],
150 | "text/plain": [
151 | " company number of employees NAICS code(s) \\\n",
152 | "0 Apple \n",
153 | "1 Canadian Tire \n",
154 | "2 Home Depot \n",
155 | "3 LinkedIn Corporation \n",
156 | "4 General Dynamics \n",
157 | "\n",
158 | " headquarter location founded year \n",
159 | "0 \n",
160 | "1 \n",
161 | "2 \n",
162 | "3 \n",
163 | "4 "
164 | ]
165 | },
166 | "execution_count": 3,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "import pandas as pd\n",
173 | "\n",
174 | "data = {\n",
175 | " 'company': [\"Apple\", 'Canadian Tire', 'Home Depot', 'LinkedIn Corporation', 'General Dynamics'],\n",
176 | " 'number of employees': ['', '', '', '', ''],\n",
177 | " 'NAICS code(s)': ['', '', '', '', ''],\n",
178 | " 'headquarter location': ['', '', '', '', ''],\n",
179 | " 'founded year': ['', '', '', '', ''],\n",
180 | "}\n",
181 | "\n",
182 | "# Convert to DataFrame\n",
183 | "df = pd.DataFrame(data)\n",
184 | "\n",
185 | "# Display the DataFrame\n",
186 | "df.head()"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "### Searching and getting results with `You.com` API\n",
194 | "\n",
195 | "We will obtain the missing values with a simple call to `You.com` API"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 4,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "# TODO - Switch out to use langchain/dspy implementation instead of reuests once available\n",
205 | "\n",
206 | "import requests\n",
207 | "import os\n",
208 | "\n",
209 | "# obtain the API key from the environment\n",
210 | "headers = {'x-api-key': os.environ['YDC_API_KEY']}\n",
211 | "\n",
212 | "def get_research_data(company_name, missing_cols, mode=\"research\"):\n",
213 | " \"\"\"\n",
214 | " Run the query with the given company domain and missing columns to obtain the missing data\n",
215 | " in the form of a JSON object.\n",
216 | " \"\"\"\n",
217 | " endpoint = f\"https://chat-api.you.com/{mode}\"\n",
218 | " params = {\"query\": f\"\"\"\n",
219 | " I am trying to find some missing information corresponding to {company_name}.\n",
220 | " For these companies, get me more information on each of the following {missing_cols}.\n",
221 | " \"\"\"\n",
222 | " }\n",
223 | " response = requests.get(endpoint, params=params, headers=headers)\n",
224 | " return response.json()[\"answer\"]"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 5,
230 | "metadata": {},
231 | "outputs": [
232 | {
233 | "name": "stdout",
234 | "output_type": "stream",
235 | "text": [
236 | "# General Dynamics: Comprehensive Company Overview\n",
237 | "\n",
238 | "## Number of Employees\n",
239 | "As of December 31, 2023, General Dynamics had 111,600 employees, marking an increase of 5,100 employees or 4.79% compared to the previous year [[1]](https://stockanalysis.com/stocks/gd/employees/#:~:text=General%20Dynamics%20had%20111%2C600%20employees%20on%20December%2031%2C%202023.%20The%20number%20of%20employees%20increased%20by%205%2C100%20or%204.79%25%20compared%20to%20the%20previous%20year). This number is consistent across multiple sources, confirming the company's substantial workforce [[2]](https://www.forbes.com/companies/general-dynamics/#:~:text=Employees%20111%2C600)[[3]](https://www.globaldata.com/company-profile/general-dynamics-corp/#:~:text=No%20of%20Employees%20111%2C600). In January 2021, the company reported having more than 100,000 full-time employees, with about 84,000 based in the United States and 16,000 in over 70 countries worldwide [[4]](https://www.gd.com/about-gd/faqs#:~:text=As%20of%20January%202021%2C%20General%20Dynamics%20had%20more%20than%20100%2C000%20full%2Dtime%20employees.%20About%2084%2C000%20of%20these%20are%20based%20in%20the%20United%20States%2C%20and%2016%2C000%20are%20based%20in%20more%20than%2070%20countries%20outside%20the%20United%20States).\n",
240 | "\n",
241 | "## NAICS Code(s)\n",
242 | "General Dynamics operates under multiple NAICS codes. The primary NAICS code is 336414, which pertains to Guided Missile and Space Vehicle Manufacturing [[5]](https://leadiq.com/c/general-dynamics/5a1d95922300005a008441e4#:~:text=NAICS%20code%20is%20336414%20%2D%20Guided%20Missile%20and%20Space%20Vehicle%20Manufacturing). Another relevant NAICS code is 336411, which is associated with Aircraft Manufacturing [[6]](https://siccode.com/business/general-dynamics-28#:~:text=NAICS%20CODE%20336411%20%2D%20Aircraft%20Manufacturing).\n",
243 | "\n",
244 | "## Headquarter Location\n",
245 | "The headquarters of General Dynamics is located at 11011 Sunset Hills Road, Reston, Virginia, 20190, United States [[7]](https://www.forbes.com/companies/general-dynamics/#:~:text=Headquarters%20Reston%2C%20Virginia)[[8]](https://www.gd.com/about-gd/faqs#:~:text=Our%20corporate%20headquarters%20is%20in%20Reston%2C%20Virginia)[[9]](https://leadiq.com/c/general-dynamics/5a1d95922300005a008441e4#:~:text=Sign%20up%20for%20free%20to%20view%20info%20Email%20Format%20Employee%20directory%20Company%20Overview%20Headquarters%2011011%20Sunset%20Hills%20Rd%20Reston%2C%20Virginia%2020190%20US%20Website%20generaldynamics)[[10]](https://www.globaldata.com/company-profile/general-dynamics-corp/#:~:text=Headquarters%20United%20States%20of%20America%20Address%2011011%20Sunset%20Hills%20Road%2C%20Reston%2C%20Virginia%2C%2020190).\n",
246 | "\n",
247 | "\n",
248 | "\n",
249 | "## Founded Year\n",
250 | "General Dynamics was founded in 1952 [[11]](https://www.forbes.com/companies/general-dynamics/#:~:text=Founded%201952)[[12]](https://www.gd.com/about-gd/faqs#:~:text=General%20Dynamics%20was%20incorporated%20in%201952%2C%20but%20traces%20its%20legacy%20back%20to%201899%20with%20the%20founding%20of%20the%20Electric%20Boat%20Company%20in%20Groton%2C%20Connecticut)[[13]](https://leadiq.com/c/general-dynamics/5a1d95922300005a008441e4#:~:text=founded%20in%201952). The company traces its legacy back to 1899 with the founding of the Electric Boat Company in Groton, Connecticut [[12]](https://www.gd.com/about-gd/faqs#:~:text=General%20Dynamics%20was%20incorporated%20in%201952%2C%20but%20traces%20its%20legacy%20back%20to%201899%20with%20the%20founding%20of%20the%20Electric%20Boat%20Company%20in%20Groton%2C%20Connecticut).\n",
251 | "\n",
252 | "## Conclusion\n",
253 | "General Dynamics is a major player in the aerospace and defense industry, with a significant global workforce and a rich history dating back to the late 19th century. The company is headquartered in Reston, Virginia, and operates under several NAICS codes related to aerospace and defense manufacturing.\n",
254 | "\n",
255 | "### References\n",
256 | "- General Dynamics had 111,600 employees on December 31, 2023 [[1]](https://stockanalysis.com/stocks/gd/employees/#:~:text=General%20Dynamics%20had%20111%2C600%20employees%20on%20December%2031%2C%202023.%20The%20number%20of%20employees%20increased%20by%205%2C100%20or%204.79%25%20compared%20to%20the%20previous%20year).\n",
257 | "- Employees 111,600 [[2]](https://www.forbes.com/companies/general-dynamics/#:~:text=Employees%20111%2C600).\n",
258 | "- No of Employees 111,600 [[3]](https://www.globaldata.com/company-profile/general-dynamics-corp/#:~:text=No%20of%20Employees%20111%2C600).\n",
259 | "- As of January 2021, General Dynamics had more than 100,000 full-time employees [[4]](https://www.gd.com/about-gd/faqs#:~:text=As%20of%20January%202021%2C%20General%20Dynamics%20had%20more%20than%20100%2C000%20full%2Dtime%20employees.%20About%2084%2C000%20of%20these%20are%20based%20in%20the%20United%20States%2C%20and%2016%2C000%20are%20based%20in%20more%20than%2070%20countries%20outside%20the%20United%20States).\n",
260 | "- General Dynamics had more than 100,000 full-time employees [[14]](https://www.gd.com/about-gd/faqs#:~:text=General%20Dynamics%20had%20more%20than%20100%2C000%20full%2Dtime%20employees).\n",
261 | "- NAICS code is 336414 - Guided Missile and Space Vehicle Manufacturing [[5]](https://leadiq.com/c/general-dynamics/5a1d95922300005a008441e4#:~:text=NAICS%20code%20is%20336414%20%2D%20Guided%20Missile%20and%20Space%20Vehicle%20Manufacturing).\n",
262 | "- NAICS CODE 336411 - Aircraft Manufacturing [[6]](https://siccode.com/business/general-dynamics-28#:~:text=NAICS%20CODE%20336411%20%2D%20Aircraft%20Manufacturing).\n",
263 | "- Headquarters Reston, Virginia [[7]](https://www.forbes.com/companies/general-dynamics/#:~:text=Headquarters%20Reston%2C%20Virginia).\n",
264 | "- Our corporate headquarters is in Reston, Virginia [[8]](https://www.gd.com/about-gd/faqs#:~:text=Our%20corporate%20headquarters%20is%20in%20Reston%2C%20Virginia).\n",
265 | "- headquarters 11011 Sunset Hills Rd Reston, Virginia 20190 US [[9]](https://leadiq.com/c/general-dynamics/5a1d95922300005a008441e4#:~:text=Sign%20up%20for%20free%20to%20view%20info%20Email%20Format%20Employee%20directory%20Company%20Overview%20Headquarters%2011011%20Sunset%20Hills%20Rd%20Reston%2C%20Virginia%2020190%20US%20Website%20generaldynamics).\n",
266 | "- Headquarters United States of America Address 11011 Sunset Hills Road, Reston, Virginia, 20190 [[10]](https://www.globaldata.com/company-profile/general-dynamics-corp/#:~:text=Headquarters%20United%20States%20of%20America%20Address%2011011%20Sunset%20Hills%20Road%2C%20Reston%2C%20Virginia%2C%2020190).\n",
267 | "- Founded 1952 [[11]](https://www.forbes.com/companies/general-dynamics/#:~:text=Founded%201952).\n",
268 | "- General Dynamics was incorporated in 1952, but traces its legacy back to 1899 with the founding of the Electric Boat Company in Groton, Connecticut [[12]](https://www.gd.com/about-gd/faqs#:~:text=General%20Dynamics%20was%20incorporated%20in%201952%2C%20but%20traces%20its%20legacy%20back%20to%201899%20with%20the%20founding%20of%20the%20Electric%20Boat%20Company%20in%20Groton%2C%20Connecticut).\n",
269 | "- founded in 1952 [[13]](https://leadiq.com/c/general-dynamics/5a1d95922300005a008441e4#:~:text=founded%20in%201952)\n"
270 | ]
271 | }
272 | ],
273 | "source": [
274 | "# Let's look at the results returned by the API for one of the companies\n",
275 | "you_response = get_research_data(\"General Dynamics\",\n",
276 | " ['number_of_employees', 'NAICS_code(s)', 'headquarter_location', 'founded_year'])\n",
277 | "print(you_response)"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {},
283 | "source": [
284 | "### Value extraction with GPT function calling\n",
285 | "\n",
286 | "The response from You.com API is in long text format along with the relevant links. As we want to fill our table with the exact information, we will be using chatGPT funtion calling to extract the information for each of the missing fields. "
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 6,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "from openai import OpenAI\n",
296 | "import json\n",
297 | "\n",
298 | "client = OpenAI()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 7,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "# Let's create a function to extract the missing data from the You.com response using chatGPT function calling\n",
308 | "def get_missing_data(you_com_response):\n",
309 | " \"\"\"\n",
310 | " Given the response from the You.com API, extract the missing data fields using function calling with chatGPT\n",
311 | " and return the extracted data as a json object. \n",
312 | " \"\"\"\n",
313 | " prompt = f\"Given the {you_com_response}, extract number of employees, NAICS code, headquarter location, and founded year.\"\n",
314 | "\n",
315 | " # function description to specify the format of the extracted data\n",
316 | " function = [\n",
317 | " {\n",
318 | " \"name\": \"get_company_data\",\n",
319 | " \"description\": \"Extract the relevant data corresponding to each field\",\n",
320 | " \"parameters\": {\n",
321 | " \"type\": \"object\",\n",
322 | " \"properties\": {\n",
323 | " \"number of employees\": {\n",
324 | " \"type\": \"string\",\n",
325 | " \"description\": \"The number of employees working in the company\",\n",
326 | " },\n",
327 | " \"NAICS code(s)\": {\n",
328 | " \"type\": \"string\",\n",
329 | " \"description\": \"The NAICS code of the company\",\n",
330 | " },\n",
331 | " \"headquarter location\": {\n",
332 | " \"type\": \"string\",\n",
333 | " \"description\": \"The headquarter location of the company\",\n",
334 | " },\n",
335 | " \"founded year\": {\n",
336 | " \"type\": \"string\",\n",
337 | " \"description\": \"The year in which the company was founded\",\n",
338 | " },\n",
339 | " },\n",
340 | " \"required\": [\"number of employees\", \"NAICS code\", \"headquarter location\", \"founded year\"],\n",
341 | " },\n",
342 | " }\n",
343 | " ]\n",
344 | "\n",
345 | " # call the chatGPT model with the function calling\n",
346 | " completion = client.chat.completions.create(\n",
347 | " model=\"gpt-3.5-turbo\",\n",
348 | " messages=[\n",
349 | " {\"role\": \"user\", \"content\": prompt}\n",
350 | " ],\n",
351 | " functions=function,\n",
352 | " function_call=\"auto\"\n",
353 | " )\n",
354 | "\n",
355 | " # get the message from the completion which contains the extracted data\n",
356 | " output = completion.choices[0].message\n",
357 | " # parse the output to extract the json object\n",
358 | " json_output = json.loads(output.function_call.arguments)\n",
359 | " \n",
360 | " return json_output"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 8,
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "data": {
370 | "text/plain": [
371 | "{'number of employees': '111600',\n",
372 | " 'NAICS code(s)': '336414, 336411',\n",
373 | " 'headquarter location': 'Reston, Virginia, 20190, United States',\n",
374 | " 'founded year': '1952'}"
375 | ]
376 | },
377 | "execution_count": 8,
378 | "metadata": {},
379 | "output_type": "execute_result"
380 | }
381 | ],
382 | "source": [
383 | "# Let's look at the extracted missing data for the company from earlier\n",
384 | "get_missing_data(you_response)"
385 | ]
386 | },
387 | {
388 | "cell_type": "markdown",
389 | "metadata": {},
390 | "source": [
391 | "### Fill the missing data\n",
392 | "\n",
393 | "Finally, let's run our functions to first search and get the infromation using `You.com` research API then extract the values of the missing columns in `json` format with GPT function calling."
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 9,
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "# call the API to get the missing data and update the DataFrame\n",
403 | "for index, row in df.iterrows():\n",
404 | " missing_cols = [col for col in df.columns if row[col] == '']\n",
405 | "\n",
406 | " if missing_cols:\n",
407 | " # get the missing data from the You.com API\n",
408 | " you_response = get_research_data(row['company'], missing_cols)\n",
409 | "\n",
410 | " # extract the missing data using chatGPT function calling\n",
411 | " json_response = get_missing_data(you_response)\n",
412 | "\n",
413 | " # fill the data into original DataFrame\n",
414 | " for col in missing_cols:\n",
415 | " df.at[index, col] = json_response.get(col, '')"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 10,
421 | "metadata": {},
422 | "outputs": [
423 | {
424 | "data": {
425 | "text/html": [
426 | "\n",
427 | "\n",
440 | "
\n",
441 | " \n",
442 | " \n",
443 | " | \n",
444 | " company | \n",
445 | " number of employees | \n",
446 | " NAICS code(s) | \n",
447 | " headquarter location | \n",
448 | " founded year | \n",
449 | "
\n",
450 | " \n",
451 | " \n",
452 | " \n",
453 | " 0 | \n",
454 | " Apple | \n",
455 | " 161,000 | \n",
456 | " 334111, 511210, 42, 42343, 33, 423, 334, 4234 | \n",
457 | " Cupertino, California 95014, United States | \n",
458 | " 1976 | \n",
459 | "
\n",
460 | " \n",
461 | " 1 | \n",
462 | " Canadian Tire | \n",
463 | " 26,420 | \n",
464 | " NAICS Code 44, NAICS Code 45 | \n",
465 | " 2180 Yonge St, Toronto, Ontario, M4S 2B9, Canada | \n",
466 | " 1922 | \n",
467 | "
\n",
468 | " \n",
469 | " 2 | \n",
470 | " Home Depot | \n",
471 | " 463100 | \n",
472 | " 444130, 444110 | \n",
473 | " 2455 Paces Ferry Road, Atlanta, Georgia 30339,... | \n",
474 | " 1978 | \n",
475 | "
\n",
476 | " \n",
477 | " 3 | \n",
478 | " LinkedIn Corporation | \n",
479 | " 19,400 | \n",
480 | " 541511 | \n",
481 | " Sunnyvale, California | \n",
482 | " 2003 | \n",
483 | "
\n",
484 | " \n",
485 | " 4 | \n",
486 | " General Dynamics | \n",
487 | " 111600 | \n",
488 | " 336414,336411 | \n",
489 | " 11011 Sunset Hills Road, Reston, Virginia, 201... | \n",
490 | " 1952 | \n",
491 | "
\n",
492 | " \n",
493 | "
\n",
494 | "
"
495 | ],
496 | "text/plain": [
497 | " company number of employees \\\n",
498 | "0 Apple 161,000 \n",
499 | "1 Canadian Tire 26,420 \n",
500 | "2 Home Depot 463100 \n",
501 | "3 LinkedIn Corporation 19,400 \n",
502 | "4 General Dynamics 111600 \n",
503 | "\n",
504 | " NAICS code(s) \\\n",
505 | "0 334111, 511210, 42, 42343, 33, 423, 334, 4234 \n",
506 | "1 NAICS Code 44, NAICS Code 45 \n",
507 | "2 444130, 444110 \n",
508 | "3 541511 \n",
509 | "4 336414,336411 \n",
510 | "\n",
511 | " headquarter location founded year \n",
512 | "0 Cupertino, California 95014, United States 1976 \n",
513 | "1 2180 Yonge St, Toronto, Ontario, M4S 2B9, Canada 1922 \n",
514 | "2 2455 Paces Ferry Road, Atlanta, Georgia 30339,... 1978 \n",
515 | "3 Sunnyvale, California 2003 \n",
516 | "4 11011 Sunset Hills Road, Reston, Virginia, 201... 1952 "
517 | ]
518 | },
519 | "execution_count": 10,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | }
523 | ],
524 | "source": [
525 | "# Finally, let's display the updated DataFrame with the missing data filled in\n",
526 | "df.head()"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": null,
532 | "metadata": {},
533 | "outputs": [],
534 | "source": []
535 | }
536 | ],
537 | "metadata": {
538 | "kernelspec": {
539 | "display_name": ".venv",
540 | "language": "python",
541 | "name": "python3"
542 | },
543 | "language_info": {
544 | "codemirror_mode": {
545 | "name": "ipython",
546 | "version": 3
547 | },
548 | "file_extension": ".py",
549 | "mimetype": "text/x-python",
550 | "name": "python",
551 | "nbconvert_exporter": "python",
552 | "pygments_lexer": "ipython3",
553 | "version": "3.10.14"
554 | }
555 | },
556 | "nbformat": 4,
557 | "nbformat_minor": 2
558 | }
559 |
--------------------------------------------------------------------------------
/langchain_retriever_evals.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "e1a49484-f9e8-4a3f-8fbe-7c7032c490a8",
14 | "showTitle": false,
15 | "title": ""
16 | }
17 | },
18 | "outputs": [],
19 | "source": [
20 | "! pip install -q transformers openai sentence-transformers datasets langchain==0.0.310"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "! pip install -q -U google-api-python-client"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {
36 | "application/vnd.databricks.v1+cell": {
37 | "cellMetadata": {
38 | "byteLimit": 2048000,
39 | "rowLimit": 10000
40 | },
41 | "inputWidgets": {},
42 | "nuid": "611b289b-1f45-4f85-a630-723fc476070c",
43 | "showTitle": false,
44 | "title": ""
45 | }
46 | },
47 | "outputs": [],
48 | "source": [
49 | "import os\n",
50 | "\n",
51 | "from langchain.retrievers.you import YouRetriever\n",
52 | "from langchain.chains import RetrievalQA\n",
53 | "from langchain.chat_models import ChatOpenAI\n",
54 | "\n",
55 | "\n",
56 | "os.environ[\"YDC_API_KEY\"] = \"YOUR YOU.COM API KEY\"\n",
57 | "os.environ[\"OPENAI_API_KEY\"] = \"YOUR OPENAI API KEY\"\n",
58 | "yr = YouRetriever()\n",
59 | "model = \"gpt-3.5-turbo-16k\"\n",
60 | "qa = RetrievalQA.from_chain_type(llm=ChatOpenAI(model=model), chain_type=\"map_reduce\", retriever=yr)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {
67 | "application/vnd.databricks.v1+cell": {
68 | "cellMetadata": {
69 | "byteLimit": 2048000,
70 | "rowLimit": 10000
71 | },
72 | "inputWidgets": {},
73 | "nuid": "00acf693-cf3d-4394-bf88-0b984153f3ab",
74 | "showTitle": false,
75 | "title": ""
76 | }
77 | },
78 | "outputs": [],
79 | "source": [
80 | "from datasets import load_dataset\n",
81 | "\n",
82 | "\n",
83 | "ds = load_dataset(\"hotpot_qa\", \"fullwiki\")[\"train\"]"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "application/vnd.databricks.v1+cell": {
91 | "cellMetadata": {
92 | "byteLimit": 2048000,
93 | "rowLimit": 10000
94 | },
95 | "inputWidgets": {},
96 | "nuid": "1231c8fd-d200-434e-9340-91b47657261d",
97 | "showTitle": false,
98 | "title": ""
99 | }
100 | },
101 | "outputs": [],
102 | "source": [
103 | "from langchain.utilities import GoogleSearchAPIWrapper\n",
104 | "\n",
105 | "\n",
106 | "os.environ[\"GOOGLE_CSE_ID\"] = \"Your Google CSE ID\"\n",
107 | "os.environ[\"GOOGLE_API_KEY\"] = \"Your Google API Key\"\n",
108 | "search = GoogleSearchAPIWrapper()\n",
109 | "\n",
110 | "def top10_results(query):\n",
111 | " return search.results(query, 10)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "application/vnd.databricks.v1+cell": {
119 | "cellMetadata": {
120 | "byteLimit": 2048000,
121 | "rowLimit": 10000
122 | },
123 | "inputWidgets": {},
124 | "nuid": "6df592b2-4508-4399-82ff-aae91fd2080e",
125 | "showTitle": false,
126 | "title": ""
127 | }
128 | },
129 | "outputs": [],
130 | "source": [
131 | "from langchain.schema.retriever import BaseRetriever, Document\n",
132 | "from typing import TYPE_CHECKING, Any, Dict, List, Optional \n",
133 | "from langchain.callbacks.manager import CallbackManagerForRetrieverRun, AsyncCallbackManagerForRetrieverRun\n",
134 | "\n",
135 | "\n",
136 | "class GoogleRetriever(BaseRetriever):\n",
137 | " def __int__(self):\n",
138 | " pass\n",
139 | "\n",
140 | " def _get_relevant_documents(\n",
141 | " self, query: str, *, run_manager: CallbackManagerForRetrieverRun\n",
142 | " ) -> List[Document]:\n",
143 | " return [Document(page_content=result.get(\"snippet\", \"\")) for result in top10_results(query)]\n",
144 | "\n",
145 | " async def _aget_relevant_documents(\n",
146 | " self,\n",
147 | " query: str,\n",
148 | " *,\n",
149 | " run_manager: AsyncCallbackManagerForRetrieverRun,\n",
150 | " **kwargs: Any,\n",
151 | " ) -> List[Document]:\n",
152 | " raise NotImplementedError()"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {
159 | "application/vnd.databricks.v1+cell": {
160 | "cellMetadata": {
161 | "byteLimit": 2048000,
162 | "rowLimit": 10000
163 | },
164 | "inputWidgets": {},
165 | "nuid": "9b4e0cc3-13fa-4453-8778-6d9f5107ce6e",
166 | "showTitle": false,
167 | "title": ""
168 | }
169 | },
170 | "outputs": [],
171 | "source": [
172 | "goog_qa = RetrievalQA.from_chain_type(\n",
173 | " llm=ChatOpenAI(model=model), chain_type=\"map_reduce\", retriever=GoogleRetriever()\n",
174 | ")"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {
181 | "application/vnd.databricks.v1+cell": {
182 | "cellMetadata": {
183 | "byteLimit": 2048000,
184 | "rowLimit": 10000
185 | },
186 | "inputWidgets": {},
187 | "nuid": "707ebbcd-76da-4b35-862e-529c2f76cb93",
188 | "showTitle": false,
189 | "title": ""
190 | }
191 | },
192 | "outputs": [],
193 | "source": [
194 | "SAMPLE_SIZE = 100\n",
195 | "pds = ds.to_pandas()\n",
196 | "pds_sample = pds.sample(SAMPLE_SIZE).reset_index()"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {
203 | "application/vnd.databricks.v1+cell": {
204 | "cellMetadata": {
205 | "byteLimit": 2048000,
206 | "rowLimit": 10000
207 | },
208 | "inputWidgets": {},
209 | "nuid": "9c303236-dca8-4fdb-b1ae-378629541ce9",
210 | "showTitle": false,
211 | "title": ""
212 | }
213 | },
214 | "outputs": [],
215 | "source": [
216 | "from concurrent.futures import ThreadPoolExecutor\n",
217 | "from tqdm import tqdm\n",
218 | "\n",
219 | "\n",
220 | "def parallel_progress_apply(column, callback, num_workers):\n",
221 | " with ThreadPoolExecutor(max_workers=num_workers) as executor:\n",
222 | " return list(tqdm(executor.map(callback, column), total=len(column)))"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {
229 | "application/vnd.databricks.v1+cell": {
230 | "cellMetadata": {
231 | "byteLimit": 2048000,
232 | "rowLimit": 10000
233 | },
234 | "inputWidgets": {},
235 | "nuid": "aed5bc7a-4ec2-4831-a222-8e43d0f9244e",
236 | "showTitle": false,
237 | "title": ""
238 | }
239 | },
240 | "outputs": [],
241 | "source": [
242 | "def get_run_chain_function(chain):\n",
243 | " def run_chain(example):\n",
244 | " try:\n",
245 | " return chain(example)[\"result\"]\n",
246 | " except:\n",
247 | " return \"\"\n",
248 | " return run_chain"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {
255 | "application/vnd.databricks.v1+cell": {
256 | "cellMetadata": {
257 | "byteLimit": 2048000,
258 | "rowLimit": 10000
259 | },
260 | "inputWidgets": {},
261 | "nuid": "01d09a6e-39b7-4739-94a8-fefd57eb7b32",
262 | "showTitle": false,
263 | "title": ""
264 | }
265 | },
266 | "outputs": [],
267 | "source": [
268 | "pds_sample[\"ydc_prediction\"] = parallel_progress_apply(\n",
269 | " pds_sample[\"question\"], lambda x: get_run_chain_function(qa)(x), num_workers=8\n",
270 | ")"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {
277 | "application/vnd.databricks.v1+cell": {
278 | "cellMetadata": {
279 | "byteLimit": 2048000,
280 | "rowLimit": 10000
281 | },
282 | "inputWidgets": {},
283 | "nuid": "fd32e6bd-6cb0-4bc4-b1da-3dc607d6d206",
284 | "showTitle": false,
285 | "title": ""
286 | }
287 | },
288 | "outputs": [],
289 | "source": [
290 | "# Can't use parallel calls here because Google API so slow :/\n",
291 | "pds_sample[\"google_prediction\"] = pds_sample[\"question\"].apply(get_run_chain_function(goog_qa))"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {
298 | "application/vnd.databricks.v1+cell": {
299 | "cellMetadata": {
300 | "byteLimit": 2048000,
301 | "rowLimit": 10000
302 | },
303 | "inputWidgets": {},
304 | "nuid": "67dc6256-956d-4400-b0af-47ea2c280dab",
305 | "showTitle": false,
306 | "title": ""
307 | }
308 | },
309 | "outputs": [],
310 | "source": [
311 | "import re\n",
312 | "import string\n",
313 | "from collections import Counter\n",
314 | "\n",
315 | "\n",
316 | "# This is all ripped from hotpot_qa source code with minor modifications to only return the f1 instead of the (P,R,F1) tuple\n",
317 | "# https://github.com/hotpotqa/hotpot/blob/master/hotpot_evaluate_v1.py#L26\n",
318 | "def calculate_f1_score(prediction, ground_truth):\n",
319 | " normalized_prediction = normalize_answer(prediction)\n",
320 | " normalized_ground_truth = normalize_answer(ground_truth)\n",
321 | "\n",
322 | " ZERO_METRIC = (0, 0, 0)\n",
323 | "\n",
324 | " if (\n",
325 | " normalized_prediction in [\"yes\", \"no\", \"noanswer\"]\n",
326 | " and normalized_prediction != normalized_ground_truth\n",
327 | " ):\n",
328 | " return 0\n",
329 | " if (\n",
330 | " normalized_ground_truth in [\"yes\", \"no\", \"noanswer\"]\n",
331 | " and normalized_prediction != normalized_ground_truth\n",
332 | " ):\n",
333 | " return 0\n",
334 | "\n",
335 | " prediction_tokens = normalized_prediction.split()\n",
336 | " ground_truth_tokens = normalized_ground_truth.split()\n",
337 | " common = Counter(prediction_tokens) & Counter(ground_truth_tokens)\n",
338 | " num_same = sum(common.values())\n",
339 | " if num_same == 0:\n",
340 | " return 0\n",
341 | " precision = 1.0 * num_same / len(prediction_tokens)\n",
342 | " recall = 1.0 * num_same / len(ground_truth_tokens)\n",
343 | " f1 = (2 * precision * recall) / (precision + recall)\n",
344 | " return f1\n",
345 | "\n",
346 | "\n",
347 | "def normalize_answer(s):\n",
348 | " def remove_articles(text):\n",
349 | " return re.sub(r\"\\b(a|an|the)\\b\", \" \", text)\n",
350 | "\n",
351 | " def white_space_fix(text):\n",
352 | " return \" \".join(text.split())\n",
353 | "\n",
354 | " def remove_punc(text):\n",
355 | " exclude = set(string.punctuation)\n",
356 | " return \"\".join(ch for ch in text if ch not in exclude)\n",
357 | "\n",
358 | " def lower(text):\n",
359 | " return text.lower()\n",
360 | "\n",
361 | " return white_space_fix(remove_articles(remove_punc(lower(s))))\n",
362 | "\n",
363 | "\n",
364 | "def exact_match_score(prediction, ground_truth):\n",
365 | " return normalize_answer(prediction) == normalize_answer(ground_truth)\n",
366 | "\n",
367 | "\n",
368 | "def filter_wiki_citation(snip):\n",
369 | " return not snip.startswith(\"- ^\")"
370 | ]
371 | },
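  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick illustrative sanity check of the metrics above (an addition for\n",
    "# clarity, not part of the original eval). Normalization lowercases,\n",
    "# strips punctuation and articles, so \"The cat sat.\" becomes \"cat sat\";\n",
    "# against ground truth \"cat sat down\": precision = 2/2, recall = 2/3,\n",
    "# so F1 = 0.8.\n",
    "print(calculate_f1_score(\"The cat sat.\", \"cat sat down\"))  # 0.8\n",
    "print(exact_match_score(\"The answer.\", \"answer\"))  # True"
   ]
  },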
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {
376 | "application/vnd.databricks.v1+cell": {
377 | "cellMetadata": {
378 | "byteLimit": 2048000,
379 | "rowLimit": 10000
380 | },
381 | "inputWidgets": {},
382 | "nuid": "d406dbfd-c187-4c82-b3d6-3b8b4726c9e5",
383 | "showTitle": false,
384 | "title": ""
385 | }
386 | },
387 | "outputs": [],
388 | "source": [
389 | "pds_sample[\"ydc_f1\"] = parallel_progress_apply(\n",
390 | " list(pds_sample.iterrows()),\n",
391 | " lambda x: calculate_f1_score(x[1][\"ydc_prediction\"], x[1][\"answer\"]),\n",
392 | " num_workers=8,\n",
393 | ")\n",
394 | "pds_sample[\"google_f1\"] = parallel_progress_apply(\n",
395 | " list(pds_sample.iterrows()),\n",
396 | " lambda x: calculate_f1_score(x[1][\"google_prediction\"], x[1][\"answer\"]),\n",
397 | " num_workers=8,\n",
398 | ")"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {
405 | "application/vnd.databricks.v1+cell": {
406 | "cellMetadata": {
407 | "byteLimit": 2048000,
408 | "rowLimit": 10000
409 | },
410 | "inputWidgets": {},
411 | "nuid": "9734c64e-3514-45c7-acba-ce15e45571c9",
412 | "showTitle": false,
413 | "title": ""
414 | }
415 | },
416 | "outputs": [],
417 | "source": [
418 | "print(\"YDC F1\")\n",
419 | "print(pds_sample[\"ydc_f1\"].mean())\n",
420 | "print(\"Google F1\")\n",
421 | "print(pds_sample[\"google_f1\"].mean())"
422 | ]
423 | },
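  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional per-question comparison (an illustrative addition, not part of\n",
    "# the original eval): mean F1 can hide distributional differences, so also\n",
    "# check how often each retriever scores strictly higher on the same question.\n",
    "print(\"YDC win rate:\", (pds_sample[\"ydc_f1\"] > pds_sample[\"google_f1\"]).mean())\n",
    "print(\"Google win rate:\", (pds_sample[\"google_f1\"] > pds_sample[\"ydc_f1\"]).mean())\n",
    "print(\"Tie rate:\", (pds_sample[\"ydc_f1\"] == pds_sample[\"google_f1\"]).mean())"
   ]
  },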
424 | {
425 | "cell_type": "code",
426 | "execution_count": null,
427 | "metadata": {
428 | "application/vnd.databricks.v1+cell": {
429 | "cellMetadata": {
430 | "byteLimit": 2048000,
431 | "rowLimit": 10000
432 | },
433 | "inputWidgets": {},
434 | "nuid": "a9ef0926-d8a6-4593-ac12-d6af67b22e70",
435 | "showTitle": false,
436 | "title": ""
437 | }
438 | },
439 | "outputs": [],
440 | "source": []
441 | }
442 | ],
443 | "metadata": {
444 | "application/vnd.databricks.v1+notebook": {
445 | "dashboards": [],
446 | "language": "python",
447 | "notebookMetadata": {
448 | "pythonIndentUnit": 4
449 | },
450 | "notebookName": "Langchain Retriever Evals",
451 | "widgets": {}
452 | },
453 | "kernelspec": {
454 | "display_name": "Python 3",
455 | "language": "python",
456 | "name": "python3"
457 | },
458 | "language_info": {
459 | "codemirror_mode": {
460 | "name": "ipython",
461 | "version": 3
462 | },
463 | "file_extension": ".py",
464 | "mimetype": "text/x-python",
465 | "name": "python",
466 | "nbconvert_exporter": "python",
467 | "pygments_lexer": "ipython3",
468 | "version": "3.10.14"
469 | }
470 | },
471 | "nbformat": 4,
472 | "nbformat_minor": 0
473 | }
474 |
--------------------------------------------------------------------------------
/legal_assistant_csv_langchain_chatbot.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Legal Assistant Bot"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This chatbot retrieves context from a proprietary datasource and the web to answer questions about federal laws in the United States of America (USA). The proprietary datasource is a CSV file of all federal laws and their revision history in the USA. The web data required to respond to the user's questions is retrieved using the You.com API. The chatbot is implemented as an agent in Langchain."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Install all required packages"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "%%capture\n",
31 | "! pip install langgraph==0.0.59\n",
32 | "! pip install pandas==2.2.2\n",
33 | "! pip install openai==1.30.3\n",
34 | "! pip install langchain==0.2.1\n",
35 | "! pip install langchain_community==0.2.1\n",
36 | "! pip install langchain_openai==0.1.7\n",
37 | "! pip install langchain-anthropic\n",
38 | "! pip install langchain_text_splitters==0.2.0\n",
39 | "! pip install langchain_core==0.2.1\n",
40 | "! pip install numpy==1.26.4\n",
41 | "! pip install python-dotenv"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "## Load in the US Federal Laws dataset and create a vector database representation of this dataset, which will then be converted into a Langchain Retriever and Tool"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/html": [
59 | "\n",
60 | "\n",
73 | "
\n",
74 | " \n",
75 | " \n",
76 | " | \n",
77 | " row_number | \n",
78 | " action | \n",
79 | " Title | \n",
80 | " sal_volume | \n",
81 | " sal_page_start | \n",
82 | " BillCitation | \n",
83 | " congress_number | \n",
84 | " chapter | \n",
85 | " session_number | \n",
86 | " pl_no | \n",
87 | " date_of_passage | \n",
88 | " secondary_date | \n",
89 | " dates_conflict | \n",
90 | " Source | \n",
91 | " URL | \n",
92 | " alternate_sal_volume | \n",
93 | " alternate_sal_page_start | \n",
94 | " has_alternate_sal_citation | \n",
95 | "
\n",
96 | " \n",
97 | " \n",
98 | " \n",
99 | " 0 | \n",
100 | " 1 | \n",
101 | " An Act | \n",
102 | " To regulate the time and manner of administeri... | \n",
103 | " 1 | \n",
104 | " 23.0 | \n",
105 | " NaN | \n",
106 | " 1 | \n",
107 | " 1.0 | \n",
108 | " 1.0 | \n",
109 | " NaN | \n",
110 | " 1789-06-01 | \n",
111 | " NaN | \n",
112 | " NaN | \n",
113 | " HeinOnline | \n",
114 | " NaN | \n",
115 | " NaN | \n",
116 | " NaN | \n",
117 | " False | \n",
118 | "
\n",
119 | " \n",
120 | " 1 | \n",
121 | " 2 | \n",
122 | " An Act | \n",
123 | " For laying a duty on goods, wares, and merchan... | \n",
124 | " 1 | \n",
125 | " 24.0 | \n",
126 | " NaN | \n",
127 | " 1 | \n",
128 | " 2.0 | \n",
129 | " 1.0 | \n",
130 | " NaN | \n",
131 | " 1789-07-04 | \n",
132 | " NaN | \n",
133 | " NaN | \n",
134 | " HeinOnline | \n",
135 | " NaN | \n",
136 | " NaN | \n",
137 | " NaN | \n",
138 | " False | \n",
139 | "
\n",
140 | " \n",
141 | " 2 | \n",
142 | " 3 | \n",
143 | " An Act | \n",
144 | " Imposing duties on tonnage. | \n",
145 | " 1 | \n",
146 | " 27.0 | \n",
147 | " NaN | \n",
148 | " 1 | \n",
149 | " 3.0 | \n",
150 | " 1.0 | \n",
151 | " NaN | \n",
152 | " 1789-07-20 | \n",
153 | " NaN | \n",
154 | " NaN | \n",
155 | " HeinOnline | \n",
156 | " NaN | \n",
157 | " NaN | \n",
158 | " NaN | \n",
159 | " False | \n",
160 | "
\n",
161 | " \n",
162 | " 3 | \n",
163 | " 4 | \n",
164 | " An Act | \n",
165 | " For establishing an executive department to be... | \n",
166 | " 1 | \n",
167 | " 28.0 | \n",
168 | " NaN | \n",
169 | " 1 | \n",
170 | " 4.0 | \n",
171 | " 1.0 | \n",
172 | " NaN | \n",
173 | " 1789-07-27 | \n",
174 | " NaN | \n",
175 | " NaN | \n",
176 | " HeinOnline | \n",
177 | " NaN | \n",
178 | " NaN | \n",
179 | " NaN | \n",
180 | " False | \n",
181 | "
\n",
182 | " \n",
183 | " 4 | \n",
184 | " 5 | \n",
185 | " An Act | \n",
186 | " To regulate the collection of the duties impos... | \n",
187 | " 1 | \n",
188 | " 29.0 | \n",
189 | " NaN | \n",
190 | " 1 | \n",
191 | " 5.0 | \n",
192 | " 1.0 | \n",
193 | " NaN | \n",
194 | " 1789-07-31 | \n",
195 | " NaN | \n",
196 | " NaN | \n",
197 | " HeinOnline | \n",
198 | " NaN | \n",
199 | " NaN | \n",
200 | " NaN | \n",
201 | " False | \n",
202 | "
\n",
203 | " \n",
204 | "
\n",
205 | "
"
206 | ],
207 | "text/plain": [
208 | " row_number action Title \\\n",
209 | "0 1 An Act To regulate the time and manner of administeri... \n",
210 | "1 2 An Act For laying a duty on goods, wares, and merchan... \n",
211 | "2 3 An Act Imposing duties on tonnage. \n",
212 | "3 4 An Act For establishing an executive department to be... \n",
213 | "4 5 An Act To regulate the collection of the duties impos... \n",
214 | "\n",
215 | " sal_volume sal_page_start BillCitation congress_number chapter \\\n",
216 | "0 1 23.0 NaN 1 1.0 \n",
217 | "1 1 24.0 NaN 1 2.0 \n",
218 | "2 1 27.0 NaN 1 3.0 \n",
219 | "3 1 28.0 NaN 1 4.0 \n",
220 | "4 1 29.0 NaN 1 5.0 \n",
221 | "\n",
222 | " session_number pl_no date_of_passage secondary_date dates_conflict \\\n",
223 | "0 1.0 NaN 1789-06-01 NaN NaN \n",
224 | "1 1.0 NaN 1789-07-04 NaN NaN \n",
225 | "2 1.0 NaN 1789-07-20 NaN NaN \n",
226 | "3 1.0 NaN 1789-07-27 NaN NaN \n",
227 | "4 1.0 NaN 1789-07-31 NaN NaN \n",
228 | "\n",
229 | " Source URL alternate_sal_volume alternate_sal_page_start \\\n",
230 | "0 HeinOnline NaN NaN NaN \n",
231 | "1 HeinOnline NaN NaN NaN \n",
232 | "2 HeinOnline NaN NaN NaN \n",
233 | "3 HeinOnline NaN NaN NaN \n",
234 | "4 HeinOnline NaN NaN NaN \n",
235 | "\n",
236 | " has_alternate_sal_citation \n",
237 | "0 False \n",
238 | "1 False \n",
239 | "2 False \n",
240 | "3 False \n",
241 | "4 False "
242 | ]
243 | },
244 | "execution_count": 2,
245 | "metadata": {},
246 | "output_type": "execute_result"
247 | }
248 | ],
249 | "source": [
250 | "# Let's take a look at our CSV dataset first\n",
251 | "import pandas as pd\n",
252 | "\n",
253 | "# The CSV file can be downloaded from: https://www.nature.com/articles/s41597-023-02758-z#Sec3\n",
254 | "df = pd.read_csv(\"us_laws_dataset.csv\")\n",
255 | "\n",
256 | "df.head()"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 3,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "import openai\n",
266 | "import langchain\n",
267 | "import os"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 4,
273 | "metadata": {},
274 | "outputs": [
275 | {
276 | "data": {
277 | "text/plain": [
278 | "True"
279 | ]
280 | },
281 | "execution_count": 4,
282 | "metadata": {},
283 | "output_type": "execute_result"
284 | }
285 | ],
286 | "source": [
287 | "#os.environ[\"YDC_API_KEY\"] = \"\"\n",
288 | "#os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
289 | "#os.environ[\"ANTHROPIC_API_KEY\"] = \"\"\n",
290 | "\n",
291 | "# Or load from .env file\n",
292 | "from dotenv import load_dotenv\n",
293 | "load_dotenv()"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 5,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
303 | "\n",
304 | "# The CSV file can be downloaded from: https://www.nature.com/articles/s41597-023-02758-z#Sec3\n",
305 | "loader = CSVLoader(file_path = \"us_laws_dataset.csv\")\n",
306 | "\n",
307 | "data = loader.load()"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 6,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "from langchain_community.vectorstores import FAISS\n",
317 | "from langchain_openai import OpenAIEmbeddings\n",
318 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
319 | "\n",
320 | "# split the document into chunks, and vectorize these chunks in a FAISS database\n",
321 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)\n",
322 | "docs = text_splitter.split_documents(data)\n",
323 | "embeddings = OpenAIEmbeddings()\n",
324 | "db = FAISS.from_documents(documents=docs, embedding=embeddings)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 7,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "data": {
334 | "text/plain": [
335 | "[Document(page_content='row_number: 1885\\naction: An Act\\nTitle: In addition to the act, entitled \"An act for the prompt settlement of public accounts,\" and for the punishment of the crime of perjury\\nsal_volume: 3\\nsal_page_start: 770\\nBillCitation: NA\\ncongress_number: 17\\nchapter: 37\\nsession_number: 2\\npl_no: NA\\ndate_of_passage: 1823-03-01\\nsecondary_date: NA\\ndates_conflict: NA\\nSource: HeinOnline\\nURL: NA\\nalternate_sal_volume: NA\\nalternate_sal_page_start: NA\\nhas_alternate_sal_citation: FALSE', metadata={'source': 'us_laws_dataset.csv', 'row': 1884}),\n",
336 | " Document(page_content='row_number: 38724\\naction: An Act\\nTitle: An act to permit the use of unsworn declarations under penalty of perjury as evidence in Federal proceedings\\nsal_volume: 90\\nsal_page_start: 2534\\nBillCitation: H.R. 15531\\ncongress_number: 94\\nchapter: NA\\nsession_number: 2\\npl_no: 94-550\\ndate_of_passage: 1976-10-18\\nsecondary_date: NA\\ndates_conflict: FALSE\\nSource: NA\\nURL: https://www.govinfo.gov/content/pkg/STATUTE-90/pdf/STATUTE-90-Pg2534.pdf\\nalternate_sal_volume: 90\\nalternate_sal_page_start: 2534\\nhas_alternate_sal_citation: TRUE', metadata={'source': 'us_laws_dataset.csv', 'row': 38720}),\n",
337 | " Document(page_content='row_number: 44400\\naction: An Act\\nTitle: An act to amend title 18, United States Code, with respect to witness retaliation, witness tampering and jury tampering\\nsal_volume: 110\\nsal_page_start: 3017\\nBillCitation: H.R. 3120\\ncongress_number: 104\\nchapter: NA\\nsession_number: 2\\npl_no: 104-214\\ndate_of_passage: 1996-10-01\\nsecondary_date: NA\\ndates_conflict: FALSE\\nSource: NA\\nURL: https://www.govinfo.gov/content/pkg/STATUTE-110/pdf/STATUTE-110-Pg3017.pdf\\nalternate_sal_volume: 110\\nalternate_sal_page_start: 3017\\nhas_alternate_sal_citation: TRUE', metadata={'source': 'us_laws_dataset.csv', 'row': 44396})]"
338 | ]
339 | },
340 | "execution_count": 7,
341 | "metadata": {},
342 | "output_type": "execute_result"
343 | }
344 | ],
345 | "source": [
346 | "# test out the similarity search\n",
347 | "query = \"What laws and amendments relate to perjury?\"\n",
348 | "response = db.similarity_search(query, k=10)\n",
349 | "# let's look at the first 3 retrieved docs\n",
350 | "response[:3]"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 8,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "from langchain.tools.retriever import create_retriever_tool\n",
360 | "\n",
361 | "# convert this retriver into a tool\n",
362 | "db_retriever = db.as_retriever()\n",
363 | "db_retriever_tool = create_retriever_tool(\n",
364 | " db_retriever,\n",
365 | " name = \"law_dataset_retriever\",\n",
366 | " description = \"Retrieve relevant context from the US laws dataset.\"\n",
367 | ")"
368 | ]
369 | },
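  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick optional sanity check (an illustrative addition), the tool can be invoked directly; it returns the retrieved documents concatenated into a single string:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invoke the retriever tool the same way the agent will; its input\n",
    "# schema has a single \"query\" field. (Illustrative check, truncated.)\n",
    "print(db_retriever_tool.invoke({\"query\": \"perjury\"})[:500])"
   ]
  },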
370 | {
371 | "cell_type": "markdown",
372 | "metadata": {},
373 | "source": [
374 | "## Instantiating the You.com Tool in Langchain"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 9,
380 | "metadata": {},
381 | "outputs": [],
382 | "source": [
383 | "from langchain_community.tools.you import YouSearchTool\n",
384 | "from langchain_community.utilities.you import YouSearchAPIWrapper\n",
385 | "\n",
386 | "api_wrapper = YouSearchAPIWrapper(num_web_results = 10)\n",
387 | "ydc_tool = YouSearchTool(api_wrapper=api_wrapper)"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 10,
393 | "metadata": {},
394 | "outputs": [
395 | {
396 | "data": {
397 | "text/plain": [
398 | "[Document(page_content='Meijer, Inc. v. Ferring B.V.; Ferring Pharmaceuticals, Inc.; and Aventis Pharmaceuticals [In Re: DDAVP Direct Purchaser Antitrust Litigation] U.S. v. Memphis Board of Realtors', metadata={'url': 'https://www.justice.gov/atr/antitrust-case-filings-alpha', 'thumbnail_url': None, 'title': 'Antitrust Division | Antitrust Case Filings | United States Department of Justice', 'description': 'An official website of the United States government · Official websites use .gov A .gov website belongs to an official government organization in the United States'}),\n",
399 | " Document(page_content=\"Dentsply International, Inc. v. Antitrust Division of the United States Department of Justice · U.S. v. Freddy Deoliveira · U.S. v. Wilhelm DerMinassian · U.S. v. Eric Descouraux · Leinani Deslandes, Stephanie Turner, et al. v. McDonald's USA, LLC, et al.\", metadata={'url': 'https://www.justice.gov/atr/antitrust-case-filings-alpha', 'thumbnail_url': None, 'title': 'Antitrust Division | Antitrust Case Filings | United States Department of Justice', 'description': 'An official website of the United States government · Official websites use .gov A .gov website belongs to an official government organization in the United States'}),\n",
400 | " Document(page_content='U.S. v. Arden-Mayfair, Inc., Matanuska Maid, Inc.; and Meadowmoor Alaska Dairy, Inc. U.S. v. Argos USA LLC, f.k.a. Argos Ready Mix LLC', metadata={'url': 'https://www.justice.gov/atr/antitrust-case-filings-alpha', 'thumbnail_url': None, 'title': 'Antitrust Division | Antitrust Case Filings | United States Department of Justice', 'description': 'An official website of the United States government · Official websites use .gov A .gov website belongs to an official government organization in the United States'})]"
401 | ]
402 | },
403 | "execution_count": 10,
404 | "metadata": {},
405 | "output_type": "execute_result"
406 | }
407 | ],
408 | "source": [
409 | "# test out the You.com search tool\n",
410 | "response = ydc_tool.invoke(\"Tell me about a recent high-profile case related to antitrust in the USA?\")\n",
411 | "# let's look at the first 3 results\n",
412 | "response[:3]"
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "## Instantiate our LLM"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 21,
425 | "metadata": {},
426 | "outputs": [],
427 | "source": [
428 | "from langchain_openai import ChatOpenAI\n",
429 | "from langchain_anthropic import ChatAnthropic\n",
430 | "\n",
431 | "llm = ChatAnthropic(model='claude-3-5-sonnet-20240620')\n",
432 | "#llm = ChatOpenAI(model=\"gpt-4o\", temperature=0.5)"
433 | ]
434 | },
435 | {
436 | "cell_type": "markdown",
437 | "metadata": {},
438 | "source": [
439 | "## Tying it all together"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 12,
445 | "metadata": {},
446 | "outputs": [],
447 | "source": [
448 | "from langgraph.prebuilt import chat_agent_executor\n",
449 | "from langgraph.checkpoint import MemorySaver\n",
450 | "\n",
451 | "# Create a checkpointer to use memory\n",
452 | "memory = MemorySaver()\n",
453 | "# the vector store representation of the CSV dataset and the You.com Search tool will both be passed as tools to the agent\n",
454 | "tools = [db_retriever_tool, ydc_tool]\n",
455 | "agent_executor = chat_agent_executor.create_tool_calling_executor(llm, tools, checkpointer=memory)"
456 | ]
457 | },
458 | {
459 | "cell_type": "markdown",
460 | "metadata": {},
461 | "source": [
462 | "## Let's try it out!"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 13,
468 | "metadata": {},
469 | "outputs": [
470 | {
471 | "name": "stdout",
472 | "output_type": "stream",
473 | "text": [
474 | "Based on the information retrieved, I can provide you with an overview of the laws pertaining to perjury in the United States and a recent case related to a violation of these laws.\n",
475 | "\n",
476 | "Laws pertaining to perjury in the US:\n",
477 | "\n",
478 | "1. The main federal statutes criminalizing perjury are 18 U.S.C. §§ 1621 and 1623.\n",
479 | "\n",
480 | "2. 18 U.S.C. § 1621 (Perjury generally):\n",
481 | " - This is the traditional, broadly applicable perjury statute.\n",
482 | " - It applies to false statements made under oath before legislative, administrative, or judicial bodies.\n",
483 | " - Key elements include:\n",
484 | " a) Taking an oath before a competent tribunal, officer, or person.\n",
485 | " b) Willfully stating or subscribing to any material matter which the person does not believe to be true.\n",
486 | " - The penalty includes fines and imprisonment for up to five years.\n",
487 | "\n",
488 | "3. 18 U.S.C. § 1623 (False declarations before grand jury or court):\n",
489 | " - This statute specifically addresses false statements made in court or before a grand jury.\n",
490 | " - It was added in 1971 as a result of the Organized Crime Control Act of 1970.\n",
491 | "\n",
492 | "4. The legal standard for perjury, as established in United States v. Dunnigan (1993), requires that a person:\n",
493 | " - Testifies under oath or affirmation\n",
494 | " - Gives false testimony concerning a material matter\n",
495 | " - Does so with the willful intent to provide false testimony (not as a result of confusion, mistake, or faulty memory)\n",
496 | "\n",
497 | "5. Subornation of perjury (18 U.S.C. § 1622) is also a crime, which involves inducing another person to commit perjury.\n",
498 | "\n",
499 | "6. Some notable aspects of US perjury law include:\n",
500 | " - A defense allowing individuals to recant a perjurious statement during the same proceeding.\n",
501 | " - Application to written declarations made under penalty of perjury, such as tax returns.\n",
502 | "\n",
503 | "Regarding a recent case related to a violation of these laws:\n",
504 | "\n",
505 | "Unfortunately, the search results don't provide information about a specific recent case involving perjury. However, it's worth noting that perjury cases can be prosecuted at both federal and state levels. The FBI has primary investigative responsibility for perjury violations in cases involving departments and agencies of the United States, with some exceptions for specific agencies.\n",
506 | "\n",
507 | "To find a recent case, you would need to search through recent court records or news reports. Perjury cases can arise in various contexts, including criminal trials, civil litigation, grand jury proceedings, and even in official documents like tax returns.\n",
508 | "\n",
509 | "If you're interested in finding a specific recent case, I would recommend searching through legal databases or recent news articles about perjury convictions or charges. This would provide the most up-to-date information on perjury cases in the United States.\n"
510 | ]
511 | }
512 | ],
513 | "source": [
514 | "prompt_1 = \"What laws in the US pertain to perjury and is there a recent case in the US that relates to a violation of these laws?\"\n",
515 | "\n",
516 | "result = agent_executor.invoke(input={\"messages\": prompt_1}, config={\"configurable\": {\"thread_id\": \"xyz_789\"}})[\"messages\"][-1].content\n",
517 | "print(result)"
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 14,
523 | "metadata": {},
524 | "outputs": [
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "Based on the search results, the most famous US Supreme Court perjury case is Bronston v. United States, 409 U.S. 352 (1973). This case is considered a seminal decision in US perjury law. Here are the key points about this case:\n",
530 | "\n",
531 | "1. Significance: Bronston v. United States is the controlling legal standard for perjury in federal jurisprudence.\n",
532 | "\n",
533 | "2. Decision: The Supreme Court strictly construed the federal perjury statute.\n",
534 | "\n",
535 | "3. Ruling: Chief Justice Warren Burger, writing for a unanimous Court, held that responses to questions made under oath that relay truthful information in themselves but are intended to mislead or evade the examiner could not be prosecuted as perjury.\n",
536 | "\n",
537 | "4. Implication: The criminal justice system must rely on more carefully worded follow-up questions to prevent evasive answers, rather than prosecuting for perjury.\n",
538 | "\n",
539 | "5. Legal Standard: The Court established that for a statement to be considered perjury, it must be false, concern a material matter, and be made with the willful intent to provide false testimony. It cannot be the result of confusion, mistake, or faulty memory.\n",
540 | "\n",
541 | "6. Impact: This decision has been cited in many subsequent cases and has become the controlling legal standard for perjury in federal courts.\n",
542 | "\n",
543 | "7. Controversy: The ruling has been criticized for creating a loophole in perjury statutes, essentially allowing a witness to mislead without legal consequences as long as their statements are literally true.\n",
544 | "\n",
545 | "8. Later Applications: The Bronston standard was notably invoked during Bill Clinton's impeachment proceedings in 1998 as a defense against perjury charges.\n",
546 | "\n",
547 | "While there have been other important perjury cases in the Supreme Court's history, Bronston v. United States stands out as the most famous and influential in shaping modern perjury law in the United States.\n"
548 | ]
549 | }
550 | ],
551 | "source": [
552 | "prompt_2 = \"What is the most famous US Supreme Court perjury case?\"\n",
553 | "result = agent_executor.invoke(input={\"messages\": prompt_2}, config={\"configurable\": {\"thread_id\": \"xyz_789\"}})[\"messages\"][-1].content\n",
554 | "print(result)"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": 15,
560 | "metadata": {},
561 | "outputs": [
562 | {
563 | "name": "stdout",
564 | "output_type": "stream",
565 | "text": [
566 | "Based on the information provided about US laws pertaining to perjury and considering the Bronston v. United States case, it's important to note that the laws have evolved over time. However, to answer your question about whether other laws could have been applied in this case, we need to consider the laws that were in effect at the time of the Bronston case in 1973. Let's analyze the situation:\n",
567 | "\n",
568 | "1. Main Perjury Statute: The primary law applied in Bronston v. United States was likely 18 U.S.C. § 1621, which is the general federal perjury statute. This law was already in place at the time of the case.\n",
569 | "\n",
570 | "2. False Declarations Statute: In 1970, just a few years before the Bronston case, Congress enacted 18 U.S.C. § 1623, which specifically addresses false declarations before a grand jury or court. This law could potentially have been applied, but it was relatively new at the time of the Bronston case.\n",
571 | "\n",
572 | "3. Unsworn Declarations: The act \"to permit the use of unsworn declarations under penalty of perjury as evidence in Federal proceedings\" (Public Law 94-550) was passed in 1976, after the Bronston case. Therefore, it couldn't have been applied in this specific case.\n",
573 | "\n",
574 | "4. Witness Tampering: The act \"to amend title 18, United States Code, with respect to witness retaliation, witness tampering and jury tampering\" (Public Law 104-214) was passed in 1996, long after the Bronston case. So this also couldn't have been applied.\n",
575 | "\n",
576 | "5. Earlier Perjury Laws: The 1823 act \"for the punishment of the crime of perjury\" might have influenced the legal framework at the time, but it's likely that its provisions had been superseded or incorporated into more recent statutes by 1973.\n",
577 | "\n",
578 | "Given this information, it appears that the main laws that could have been applied in the Bronston case were:\n",
579 | "\n",
580 | "1. 18 U.S.C. § 1621 (the general perjury statute)\n",
581 | "2. 18 U.S.C. § 1623 (false declarations before a grand jury or court)\n",
582 | "\n",
583 | "The Court's decision in Bronston focused on interpreting these statutes, particularly § 1621. The Court's ruling essentially narrowed the application of the perjury statute by holding that literally true but misleading statements do not constitute perjury.\n",
584 | "\n",
585 | "It's worth noting that while other related laws existed or were later enacted, the Bronston decision set a precedent in interpreting perjury statutes that has influenced how these laws are applied in subsequent cases. The Court's interpretation emphasized the importance of precise questioning and placed the burden on the questioner to clarify evasive answers, rather than relying on perjury charges to address misleading but technically truthful statements.\n"
586 | ]
587 | }
588 | ],
589 | "source": [
590 | "prompt_3 = \"Based on your knowledge of all laws in the US pertaining to perjury, were there any other laws that could have been applied in this case?\"\n",
591 | "result = agent_executor.invoke(input={\"messages\":prompt_3}, config={\"configurable\": {\"thread_id\": \"xyz_789\"}})[\"messages\"][-1].content\n",
592 | "print(result)"
593 | ]
594 | },
595 | {
596 | "cell_type": "markdown",
597 | "metadata": {},
598 | "source": [
599 | "## Let's create a Python class that encapsulates the code above. This will enable users to easily create chatbots for new use cases with custom CSV datasets"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 22,
605 | "metadata": {},
606 | "outputs": [],
607 | "source": [
608 | "import secrets\n",
609 | "from typing import Union\n",
610 | "\n",
611 | "class CSV_QA_Bot:\n",
612 | " def __init__(self, llm: Union[ChatOpenAI, ChatAnthropic], csv_files: list[str], num_web_results_to_fetch: int = 10):\n",
613 | " self._llm = llm\n",
614 | " \n",
615 | " docs = self._load_csv_files(csv_files)\n",
616 | " \n",
617 | " # split the docs into chunks, vectorize the chunks and load them into a vector store\n",
618 | " db = self._create_vector_store(docs)\n",
619 | " \n",
620 | " # create a retriever from the vector store\n",
621 | " self._faiss_retriever = db.as_retriever()\n",
622 | " \n",
623 | " # convert this retriever into a Langchain tool\n",
624 | " self._faiss_retriever_tool = create_retriever_tool(\n",
625 | " self._faiss_retriever,\n",
626 | " name = \"law_dataset_retriever\",\n",
627 | " description = \"Retrieve relevant context from the US laws dataset.\"\n",
628 | " )\n",
629 | " \n",
630 | " # instantiate the YDC search tool in Langchain\n",
631 | " self._ydc_api_wrapper = YouSearchAPIWrapper(num_web_results=num_web_results_to_fetch)\n",
632 | " self._ydc_search_tool = YouSearchTool(api_wrapper=self._ydc_api_wrapper)\n",
633 | " \n",
634 | " \n",
635 | " # create a list of tools that will be supplied to the Langchain agent\n",
636 | " self._tools = [self._faiss_retriever_tool, self._ydc_search_tool]\n",
637 | " \n",
638 | " # Create a checkpointer to use memory\n",
639 | " self._memory = MemorySaver()\n",
640 | " \n",
641 | " # create the agent executor\n",
642 | " self._agent_executor = chat_agent_executor.create_tool_calling_executor(self._llm, tools, checkpointer=memory)\n",
643 | " \n",
644 | " # generate a thread ID for to keep track of conversation history\n",
645 | " self._thread_id = self._generate_thread_id()\n",
646 | "\n",
647 | " def _load_csv_files(self, csv_files: list[str]) -> list:\n",
648 | " docs = []\n",
649 | " for file in csv_files:\n",
650 | " data_loader = CSVLoader(file)\n",
651 | " docs.extend(data_loader.load())\n",
652 | " return docs\n",
653 | " \n",
654 | " def _create_vector_store(self, docs: list) -> FAISS:\n",
655 | " text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)\n",
656 | " chunked_docs = text_splitter.split_documents(docs)\n",
657 | " embeddings = OpenAIEmbeddings()\n",
658 | " return FAISS.from_documents(documents=chunked_docs, embedding=embeddings)\n",
659 | "\n",
660 | " def _generate_thread_id(self) -> str:\n",
661 | " thread_id = secrets.token_urlsafe(16)\n",
662 | " return thread_id\n",
663 | " \n",
664 | " def invoke_bot(self, input_str: str) -> str:\n",
665 | " input = {\"messages\": input_str}\n",
666 | " config = {\"configurable\": {\"thread_id\": self._thread_id}}\n",
667 | " output = self._agent_executor.invoke(input=input, config=config)[\"messages\"][-1].content\n",
668 | " return output"
669 | ]
670 | },
671 | {
672 | "cell_type": "markdown",
673 | "metadata": {},
674 | "source": [
675 | "## Let's try it out!"
676 | ]
677 | },
678 | {
679 | "cell_type": "code",
680 | "execution_count": 23,
681 | "metadata": {},
682 | "outputs": [],
683 | "source": [
684 | "conversational_agent = CSV_QA_Bot(llm, csv_files=[\"us_laws_dataset.csv\"])"
685 | ]
686 | },
687 | {
688 | "cell_type": "code",
689 | "execution_count": 24,
690 | "metadata": {},
691 | "outputs": [
692 | {
693 | "name": "stdout",
694 | "output_type": "stream",
695 | "text": [
696 | "Based on the search results, I can provide information about a recent perjury case in the United States. The case involves Craig German, a 60-year-old man from Kernersville, North Carolina. This case is particularly interesting because it relates to perjury committed during the sentencing phase of a previous case.\n",
697 | "\n",
698 | "Here are the key details of this recent perjury case:\n",
699 | "\n",
700 | "1. Background: Craig German was previously convicted for conspiring to steal trade secrets from aircraft manufacturing companies.\n",
701 | "\n",
702 | "2. New Charges: After his initial conviction, German faced additional charges for committing perjury in his prior case and for providing false statements to a government agency (the FBI).\n",
703 | "\n",
704 | "3. Trial and Conviction: A federal jury in the U.S. District Court for the Southern District of Georgia found German guilty of perjury and false statements to a government agency after a three-day trial.\n",
705 | "\n",
706 | "4. Specifics of the Perjury:\n",
707 | " - During the sentencing portion of his prior case, German testified under oath and denied copying more than 15,000 proprietary engineering drawings and documents onto a removable storage device while employed at an aircraft manufacturing company.\n",
708 | " - The jury found this testimony to be false, constituting perjury.\n",
709 | "\n",
710 | "5. Additional False Statements: German was also found guilty of providing a materially false statement to the FBI during a voluntary meeting, where he emphatically denied copying, taking, or otherwise transferring the proprietary documents.\n",
711 | "\n",
712 | "6. Prosecution: The case was investigated by the FBI and prosecuted by Assistant U.S. Attorneys Jennifer G. Solari and Steven H. Lee.\n",
713 | "\n",
714 | "7. Potential Consequences: The conviction for perjury and false statements means German faces additional prison time on top of his sentence from the previous case involving trade secret theft.\n",
715 | "\n",
716 | "This case illustrates how seriously the U.S. justice system takes perjury, especially when it occurs during official proceedings like sentencing hearings. It also demonstrates that lying to federal investigators (in this case, the FBI) can result in additional criminal charges.\n",
717 | "\n",
718 | "The case aligns with the laws we discussed earlier, particularly the ones related to using declarations under penalty of perjury in federal proceedings and addressing witness tampering. It shows that the legal system actively enforces these laws to maintain the integrity of court proceedings and investigations.\n"
719 | ]
720 | }
721 | ],
722 | "source": [
723 | "prompt_1 = \"What laws in the US pertain to perjury and is there a recent case in the US that relates to a violation of these laws?\"\n",
724 | "\n",
725 | "print(conversational_agent.invoke_bot(prompt_1))"
726 | ]
727 | },
728 | {
729 | "cell_type": "code",
730 | "execution_count": 25,
731 | "metadata": {},
732 | "outputs": [
733 | {
734 | "name": "stdout",
735 | "output_type": "stream",
736 | "text": [
737 | "Based on the search results, the most famous US Supreme Court perjury case is Bronston v. United States, 409 U.S. 352 (1973). This case is considered seminal in US jurisprudence regarding perjury. Here are the key points about this landmark case:\n",
738 | "\n",
739 | "1. Significance: Bronston v. United States is the controlling legal standard for perjury in federal jurisprudence and has been widely cited since its decision.\n",
740 | "\n",
741 | "2. Ruling: The Supreme Court, in a unanimous decision written by Chief Justice Warren Burger, strictly construed the federal perjury statute.\n",
742 | "\n",
743 | "3. Key principle: The Court held that responses to questions made under oath that relay truthful information in themselves but are intended to mislead or evade the examiner cannot be prosecuted as perjury.\n",
744 | "\n",
745 | "4. Implications: This decision essentially created a loophole in perjury statutes, allowing witnesses to potentially mislead without legal consequences as long as their statements are literally true.\n",
746 | "\n",
747 | "5. Remedy: The Court stated that the criminal justice system must rely on more carefully worded follow-up questions to address evasive answers, rather than prosecuting for perjury.\n",
748 | "\n",
749 | "6. Later impact: The Bronston decision was invoked during Bill Clinton's impeachment proceedings in 1998 as a defense against perjury charges.\n",
750 | "\n",
751 | "7. Criticism: The ruling has been long criticized for potentially allowing witnesses to lie without consequences, but subsequent Courts have refused to overrule or limit its scope.\n",
752 | "\n",
753 | "8. Ongoing relevance: Legal experts continue to analyze cases under the Bronston standard, as seen in the example of President Clinton's testimony during his impeachment proceedings.\n",
754 | "\n",
755 | "This case remains crucial in understanding how perjury is defined and prosecuted in the United States federal court system, making it the most famous Supreme Court perjury case to date.\n"
756 | ]
757 | }
758 | ],
759 | "source": [
760 | "prompt_2 = \"What is the most famous US Supreme Court perjury case?\"\n",
761 | "\n",
762 | "print(conversational_agent.invoke_bot(prompt_2))"
763 | ]
764 | },
765 | {
766 | "cell_type": "code",
767 | "execution_count": 26,
768 | "metadata": {},
769 | "outputs": [
770 | {
771 | "name": "stdout",
772 | "output_type": "stream",
773 | "text": [
774 | "Based on the information retrieved about US laws pertaining to perjury and considering the Bronston v. United States case, it's important to note that the Bronston case was decided in 1973, which predates some of the laws mentioned in the retrieval. However, we can analyze whether any of these laws or other existing laws at the time could have been applied in this case:\n",
775 | "\n",
776 | "1. The 1823 Act for the punishment of the crime of perjury: This act was likely superseded by more modern statutes by the time of the Bronston case, but it shows the long-standing nature of perjury laws in the US.\n",
777 | "\n",
778 | "2. The federal perjury statute (18 U.S.C. § 1621): This was the primary law under consideration in the Bronston case. The Court's interpretation of this statute led to the ruling that literal truth, even if misleading, cannot be prosecuted as perjury.\n",
779 | "\n",
780 | "3. False Statements Statute (18 U.S.C. § 1001): While not specifically a perjury statute, this law prohibits making false statements to federal officials. It could potentially have been considered in cases where the statements were made to federal officials outside of court proceedings.\n",
781 | "\n",
782 | "4. Obstruction of Justice (18 U.S.C. § 1503): Although not a perjury statute per se, this law could potentially have been applied if the misleading statements were seen as an attempt to obstruct justice.\n",
783 | "\n",
784 | "5. The 1976 Act permitting unsworn declarations under penalty of perjury: This law came after the Bronston case, so it wouldn't have been applicable. However, it shows the evolution of perjury laws to include unsworn statements in certain circumstances.\n",
785 | "\n",
786 | "6. The 1996 Act amending Title 18 with respect to witness retaliation, witness tampering, and jury tampering: This also came after Bronston, but it demonstrates how laws related to court proceedings and testimony have expanded over time.\n",
787 | "\n",
788 | "In the specific context of the Bronston case, the Court's interpretation focused narrowly on the federal perjury statute. The ruling essentially stated that if a witness gives an answer that is literally true but nonresponsive to the question, it cannot be considered perjury even if the answer is intentionally misleading.\n",
789 | "\n",
790 | "Given this interpretation, it's unlikely that other laws could have been successfully applied in this specific case. The Court's decision set a high bar for perjury convictions, emphasizing the need for explicit falsehoods rather than merely misleading statements.\n",
791 | "\n",
792 | "However, in subsequent cases, prosecutors might consider using a combination of charges, including obstruction of justice or false statements, to address situations where witnesses provide misleading but technically truthful answers. The evolution of laws after Bronston also suggests that legislators have attempted to close some of the loopholes created by this decision, particularly in areas related to witness testimony and court proceedings.\n",
793 | "\n",
794 | "It's worth noting that the Bronston decision remains controversial, with some legal scholars arguing that it creates too high a bar for perjury convictions and potentially encourages evasive testimony. Nonetheless, it remains the controlling precedent in federal perjury cases.\n"
795 | ]
796 | }
797 | ],
798 | "source": [
799 | "prompt_3 = \"Based on your knowledge of all laws in the US pertaining to perjury, were there any other laws that could have been applied in this case?\"\n",
800 | "\n",
801 | "print(conversational_agent.invoke_bot(prompt_3))"
802 | ]
803 | },
804 | {
805 | "cell_type": "code",
806 | "execution_count": null,
807 | "metadata": {},
808 | "outputs": [],
809 | "source": []
810 | }
811 | ],
812 | "metadata": {
813 | "kernelspec": {
814 | "display_name": "llm-practice",
815 | "language": "python",
816 | "name": "python3"
817 | },
818 | "language_info": {
819 | "codemirror_mode": {
820 | "name": "ipython",
821 | "version": 3
822 | },
823 | "file_extension": ".py",
824 | "mimetype": "text/x-python",
825 | "name": "python",
826 | "nbconvert_exporter": "python",
827 | "pygments_lexer": "ipython3",
828 | "version": "3.10.14"
829 | }
830 | },
831 | "nbformat": 4,
832 | "nbformat_minor": 2
833 | }
834 |
--------------------------------------------------------------------------------
/transcripts_to_salesforce.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Extracting structred information from Unstrctured Data\n",
8 | "\n",
9 | "Often, sales representatives are required to go through call transcriptions to extract relevant details and manually enter them into Salesforce tables. This process is very time consuming. Using You.com's APIs, it's possible to automate the extraction of structured information from unstructured data like call transcripts. The following sections of the notebook demonstrates how the APIs can answer questions based on the transcript, and when there is not enough information available, the API can search through the web to find the most relevant answer."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "### Setup environment"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "%%capture\n",
26 | "! pip install python-dotenv\n",
27 | "! pip install requests"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "True"
39 | ]
40 | },
41 | "execution_count": 2,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "# The YDC_API_KEY and OPENAI_API_KEY should be defined in a .env file\n",
48 | "# Let's load the API keys in from the .env file\n",
49 | "import dotenv\n",
50 | "\n",
51 | "dotenv.load_dotenv(\".env\", override=True)"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "### Sample Transcript\n",
59 | "\n",
60 | "Following is a sample transcript between Alex, a Sales Representative from Tech Solution and Taylor who is an IT Analyst at You.com. Alex has booked a call with Taylor to understand potential pain points his company might be facing. "
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 3,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "transcript = \"\"\"\n",
70 | "Sales Representative: Hi, this is Alex from Tech Solutions. Is this Taylor?\n",
71 | "\n",
72 | "Prospect: Yes, this is Taylor.\n",
73 | "\n",
74 | "Sales Representative: Great! How are you today, Taylor?\n",
75 | "\n",
76 | "Prospect: I'm doing well, thanks. How about you?\n",
77 | "\n",
78 | "Sales Representative: I'm doing well, thank you! If you don't mind me asking, Taylor, what is your role at You.com?\n",
79 | "\n",
80 | "Prospect: I'm an IT Analyst at You.com. I manage the IT infrastructure and help resolve any technical issues that arise.\n",
81 | "\n",
82 | "Sales Representative: That's great! I appreciate you taking the time to speak with me. \n",
83 | "The reason for my call is that I noticed youdotcom has been growing rapidly, and I wanted to see \n",
84 | "if there's a way we can help streamline some of your IT processes. Do you have a few minutes to discuss this?\n",
85 | "\n",
86 | "Prospect: Sure, I can spare a few minutes.\n",
87 | "\n",
88 | "Sales Representative: Excellent. To give you a bit of context, at Tech Solutions, we specialize in \n",
89 | "providing comprehensive IT management software that helps companies like yours save time and reduce costs. \n",
90 | "Can you tell me a bit about your current IT setup and any challenges you might be facing?\n",
91 | "\n",
92 | "Prospect: Well, we have a small team managing our IT infrastructure, and sometimes it gets overwhelming.\n",
93 | "We’ve been experiencing a few issues with network downtime and managing our software licenses.\n",
94 | "\n",
95 | "Sales Representative: I understand. Network downtime and software management can definitely be challenging. \n",
96 | "How much downtime are you currently experiencing, and how is it impacting your operations?\n",
97 | "\n",
98 | "Prospect: We’ve had a few incidents in the past month, resulting in a couple of hours of downtime each time. \n",
99 | "It's affecting our productivity and causing some frustration among the team.\n",
100 | "\n",
101 | "Sales Representative: That sounds frustrating indeed. At Tech Solutions, our software is designed to minimize\n",
102 | "downtime by proactively monitoring your network and automating many of the manual tasks that can lead to errors and delays. \n",
103 | "Additionally, our license management feature ensures that all your software licenses are up-to-date and compliant, \n",
104 | "reducing the risk of unexpected downtime. Would a solution like this be of interest to you?\n",
105 | "\n",
106 | "Prospect: It does sound helpful. How does it work exactly?\n",
107 | "\n",
108 | "Sales Representative: Great question! Our software integrates seamlessly with your existing infrastructure.\n",
109 | "It provides real-time monitoring and alerts for any potential issues, allowing your team to address them before\n",
110 | "they cause downtime. It also automates software updates and license renewals, so you don’t have to worry about them manually.\n",
111 | "Would you be interested in seeing a demo of how this works?\n",
112 | "\n",
113 | "Prospect: Yes, a demo would be useful.\n",
114 | "\n",
115 | "Sales Representative: Fantastic! I can schedule a demo for you with one of our specialists.\n",
116 | "They can walk you through the features and show you how it can specifically benefit You.com.\n",
117 | "Does tomorrow at 2 PM work for you?\n",
118 | "\n",
119 | "Prospect: Tomorrow at 2 PM works for me.\n",
120 | "\n",
121 | "Sales Representative: Perfect. I’ll send you a calendar invite with all the details.\n",
122 | "Before we wrap up, do you have any other questions or concerns that you'd like to address?\n",
123 | "\n",
124 | "Prospect: Not at the moment, but I’m looking forward to the demo.\n",
125 | "\n",
126 | "Sales Representative: Great to hear. Thanks again for your time, Taylor. \n",
127 | "I’ll send over the invite shortly, and I look forward to speaking with you tomorrow.\n",
128 | "\n",
129 | "Prospect: Thank you, Alex. Talk to you then.\n",
130 | "\n",
131 | "Sales Representative: Take care!\n",
132 | "\"\"\""
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "After the call is over, Alex wants to update the Salesforce table on how the call went. Following are the list of questions that Alex wants to know the answer to. Note that not all questions can be answered solely based on the transcript, and in fact might involve searching through the web to find relevant information."
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 4,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "questions = \"\"\"\n",
149 | "what is the name of the prospect?\n",
150 | "what is the prospect's role?\n",
151 | "what is the name of the company?\n",
152 | "what is the company's website URL?\n",
153 | "what is company's LinkedIn URL?\n",
154 | "what is the market segment the prospect operates in?\n",
155 | "what are the pain points expressed by the prospect?\n",
156 | "what are the follow up actions?\n",
157 | "what is the overall sentiment of the conversation?\n",
158 | "\"\"\""
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "### Extract structured information\n",
166 | "\n",
167 | "Given a transcript, the following function extracts structural information and answers specific questions based on the transcript provided.\n",
168 | "If the answers to the question do not exist in the transcript, search the You.com searches the web for relevant answers."
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 5,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "import requests\n",
178 | "import os\n",
179 | "\n",
180 | "def get_structural_info(transcript, questions, mode=\"research\"):\n",
181 | " headers = {'x-api-key': os.environ['YDC_API_KEY']}\n",
182 | " endpoint = f\"https://chat-api.you.com/{mode}\"\n",
183 | "\n",
184 | " params = {\"query\": f\"\"\"\n",
185 | " You are a helpful assistant who is given a transcript` between a sales representative and a prospect.\n",
186 | " Your task is to extract the information from the transcript as well the web to answer the following questions:\n",
187 | " {questions}\n",
188 | "\n",
189 | " The transcript you need to analyze is as follows:\n",
190 | " {transcript}\n",
191 | "\n",
192 | " Convert relative time references to absolute time references.\n",
193 | " Return the extracted information as a JSON object.\n",
194 | " \"\"\"\n",
195 | " }\n",
196 | " response = requests.get(endpoint, params=params, headers=headers)\n",
197 | " return response.json()"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "### Results\n",
205 | "Display the answers to Alex's questions:"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 6,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "{\n",
218 | " \"name_of_prospect\": \"Taylor\",\n",
219 | " \"prospect_role\": \"IT Analyst\",\n",
220 | " \"name_of_company\": \"You.com\",\n",
221 | " \"company_website_url\": \"https://www.you.com\",\n",
222 | " \"company_linkedin_url\": \"https://www.linkedin.com/company/you-com\",\n",
223 | " \"market_segment\": \"IT management software\",\n",
224 | " \"pain_points\": [\n",
225 | " \"Network downtime\",\n",
226 | " \"Managing software licenses\"\n",
227 | " ],\n",
228 | " \"follow_up_actions\": [\n",
229 | " \"Schedule a demo for tomorrow at 2 PM\",\n",
230 | " \"Send a calendar invite with all the details\"\n",
231 | " ],\n",
232 | " \"overall_sentiment\": \"Positive\"\n",
233 | "}\n"
234 | ]
235 | }
236 | ],
237 | "source": [
238 | "structural_info = get_structural_info(transcript, questions, mode=\"smart\")\n",
239 | "print(structural_info[\"answer\"])"
240 | ]
241 | },
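  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since the `answer` field is returned as a JSON-formatted string, it can be parsed into a Python dictionary whose fields map onto Salesforce columns. A minimal sketch, assuming the model returned valid JSON (worth validating in production) and using the key names from the sample output above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Parse the JSON string into a dict; each key can then be written to the\n",
    "# corresponding Salesforce field instead of being entered manually.\n",
    "# Note: the keys are model-generated, so they should be validated first.\n",
    "record = json.loads(structural_info[\"answer\"])\n",
    "print(record[\"name_of_prospect\"], \"-\", record[\"prospect_role\"])"
   ]
  },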
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "Note that information such as `company_website` and `company_linkedin` are not available on the transcript. Similarly, the API can also answer questions like the overall sentiment of the transcript by leveraging underlying foundational LLMs."
247 | ]
248 | }
249 | ],
250 | "metadata": {
251 | "kernelspec": {
252 | "display_name": ".venv",
253 | "language": "python",
254 | "name": "python3"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.10.14"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 2
271 | }
272 |
--------------------------------------------------------------------------------
/you_dspy_vc_chat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# you.com <> dspy: ChatVC\n",
8 | "\n",
9 | "A chatbot that I could ask questions about early-stage investing and any relevants news to potential investment opportunities."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "%%capture\n",
19 | "! pip install dspy==0.1.5\n",
20 | "! pip install dotenv==0.0.5"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Load API keys"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 9,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "True"
39 | ]
40 | },
41 | "execution_count": 9,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "# assumes a .env file exists with api keys YDC_API_KEY and OPENAI_API_KEY\n",
48 | "\n",
49 | "from dotenv import load_dotenv\n",
50 | "\n",
51 | "load_dotenv()"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "## Building blocks\n",
59 | "\n",
60 | "This section introduces the blocks to build out a RAG and ReAct agent later on."
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "### Create Language Model (lm)\n",
68 | "\n",
69 | "A Language Model (lm) in DSPy refers to a framework for programming and interacting with large language models."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 85,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "import dspy\n",
79 | "\n",
80 | "turbo = dspy.OpenAI(model='gpt-4o')\n",
81 | "\n",
82 | "# thread-safe built-in\n",
83 | "dspy.settings.configure(lm=turbo)"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "### Signature\n",
91 | "\n",
92 | "Every call to the LM in a DSPy program needs to have a `Signature`.\n",
93 | "\n",
94 | "A signature consists of three simple elements:\n",
95 | "\n",
96 | "* A minimal description of the sub-task the LM is supposed to solve.\n",
97 | "* A description of one or more input fields (e.g., input question) that we will give to the LM.\n",
98 | "* A description of one or more output fields (e.g., the question's answer) that we will expect from the LM."
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 12,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "class BasicQA(dspy.Signature):\n",
108 | " \"\"\"Answer questions with wise suggestions\"\"\"\n",
109 | "\n",
110 | " question = dspy.InputField()\n",
111 | " answer = dspy.OutputField(desc=\"often between 40-50 words\")"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "### Try it out!"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 13,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "Question: If you are advising a founder on how they should choose an invester in their company, what qualities should they look for?\n",
131 | "Predicted Answer: Question: If you are advising a founder on how they should choose an investor in their company, what qualities should they look for?\n",
132 | "Answer: Look for investors who align with your vision, bring industry expertise, and offer valuable networks. Ensure they have a track record of supporting startups and can provide strategic guidance. Compatibility in values and communication style is also crucial for a successful partnership.\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "question = \"If you are advising a founder on how they should choose an invester in their company, what qualities should they look for?\"\n",
138 | "\n",
139 | "# Define the predictor.\n",
140 | "generate_answer = dspy.Predict(BasicQA)\n",
141 | "\n",
142 | "# Call the predictor on a particular input.\n",
143 | "pred = generate_answer(question=question)\n",
144 | "\n",
145 | "# Print the input and the prediction.\n",
146 | "print(f\"Question: {question}\")\n",
147 | "print(f\"Predicted Answer: {pred.answer}\")"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "### Create Retriever Model (rm)\n",
155 | "\n",
156 | "A Retriever Model refers to a component that is responsible for retrieving relevant information from a retrieval corpus based on user queries. In this case, we'll be using You.com's news API as a retriever."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 14,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "import os\n",
166 | "import warnings\n",
167 | "from typing import Any, Literal, Optional, Union\n",
168 | "\n",
169 | "import requests\n",
170 | "\n",
171 | "import dspy\n",
172 | "from dsp.utils import dotdict\n",
173 | "\n",
174 | "\n",
175 | "class YouRM(dspy.Retrieve):\n",
176 | " \"\"\"Retriever for You.com's Search and News API.\n",
177 | "\n",
178 | " [API reference](https://documentation.you.com/api-reference/)\n",
179 | "\n",
180 | " Args:\n",
181 | " ydc_api_key: you.com API key, if `YDC_API_KEY` is not set in the environment\n",
182 | " k: If ``endpoint=\"search\"``, the max snippets to return per search hit.\n",
183 | " If ``endpoint=\"news\"``, the max articles to return.\n",
184 | " endpoint: you.com endpoints\n",
185 | " num_web_results: The max number of web results to return, must be under 20\n",
186 | " safesearch: Safesearch settings, one of \"off\", \"moderate\", \"strict\", defaults to moderate\n",
187 | " country: Country code, ex: 'US' for United States, see API reference for more info\n",
188 | " search_lang: (News API) Language codes, ex: 'en' for English, see API reference for more info\n",
189 | " ui_lang: (News API) User interface language for the response, ex: 'en' for English.\n",
190 | " See API reference for more info\n",
191 | " spellcheck: (News API) Whether to spell check query or not, defaults to True\n",
192 | " \"\"\"\n",
193 | "\n",
194 | " def __init__(\n",
195 | " self,\n",
196 | " ydc_api_key: Optional[str] = None,\n",
197 | " k: int = 3,\n",
198 | " endpoint: Literal[\"search\", \"news\"] = \"search\",\n",
199 | " num_web_results: Optional[int] = None,\n",
200 | " safesearch: Optional[Literal[\"off\", \"moderate\", \"strict\"]] = None,\n",
201 | " country: Optional[str] = None,\n",
202 | " search_lang: Optional[str] = None,\n",
203 | " ui_lang: Optional[str] = None,\n",
204 | " spellcheck: Optional[bool] = None,\n",
205 | " ):\n",
206 | " super().__init__(k=k)\n",
207 | "\n",
208 | " # Data validation\n",
209 | " if not ydc_api_key and not os.environ.get(\"YDC_API_KEY\"):\n",
210 | " raise RuntimeError('You must supply `ydc_api_key` or set environment variable \"YDC_API_KEY\"')\n",
211 | "\n",
212 | " if endpoint not in (\"search\", \"news\"):\n",
213 | " raise ValueError('`endpoint` must be either \"search\" or \"news\"')\n",
214 | "\n",
215 | " # Raise warning if News API-specific fields are set but endpoint is not \"news\"\n",
216 | " if endpoint != \"news\":\n",
217 | " news_api_fields = (search_lang, ui_lang, spellcheck)\n",
218 | " for field in news_api_fields:\n",
219 | " if field:\n",
220 | " warnings.warn(\n",
221 | " (\n",
222 | " f\"News API-specific field '{field}' is set but `{endpoint=}`. \"\n",
223 | " \"This will have no effect.\"\n",
224 | " ),\n",
225 | " UserWarning,\n",
226 | " )\n",
227 | "\n",
228 | " self.ydc_api_key = ydc_api_key or os.environ.get(\"YDC_API_KEY\")\n",
229 | " self.endpoint = endpoint\n",
230 | " self.num_web_results = num_web_results\n",
231 | " self.safesearch = safesearch\n",
232 | " self.country = country\n",
233 | " self.search_lang = search_lang\n",
234 | " self.ui_lang = ui_lang\n",
235 | " self.spellcheck = spellcheck\n",
236 | "\n",
237 | " def _generate_params(self, query: str) -> dict[str, Any]:\n",
238 | " params = {\"safesearch\": self.safesearch, \"country\": self.country}\n",
239 | "\n",
240 | " if self.endpoint == \"search\":\n",
241 | " params.update(\n",
242 | " query=query,\n",
243 | " num_web_results=self.num_web_results,\n",
244 | " )\n",
245 | " elif self.endpoint == \"news\":\n",
246 | " params.update(\n",
247 | " q=query,\n",
248 | " count=self.num_web_results,\n",
249 | " search_lang=self.search_lang,\n",
250 | " ui_lang=self.ui_lang,\n",
251 | " spellcheck=self.spellcheck,\n",
252 | " )\n",
253 | "\n",
254 | " # Remove `None` values\n",
255 | " params = {k: v for k, v in params.items() if v is not None}\n",
256 | " return params\n",
257 | "\n",
258 | " def forward(self, query_or_queries: Union[str, list[str]], k: Optional[int] = None) -> dspy.Prediction:\n",
259 | " k = k if k is not None else self.k\n",
260 | "\n",
261 | " queries = [query_or_queries] if isinstance(query_or_queries, str) else query_or_queries\n",
262 | " docs: list[str]\n",
263 | " for query in queries:\n",
264 | " headers = {\"X-API-Key\": self.ydc_api_key}\n",
265 | " params = self._generate_params(query)\n",
266 | " response = requests.get(\n",
267 | " f\"https://api.ydc-index.io/{self.endpoint}\",\n",
268 | " params=params,\n",
269 | " headers=headers,\n",
270 | " )\n",
271 | " response.raise_for_status()\n",
272 | " results = response.json()\n",
273 | "\n",
274 | " if self.endpoint == \"search\":\n",
275 | " docs = [snippet for hits in results[\"hits\"][:k] for snippet in hits[\"snippets\"]]\n",
276 | " elif self.endpoint == \"news\":\n",
277 | " docs = [article[\"description\"] for article in results[\"news\"][\"results\"][:k]]\n",
278 | " return [dotdict({\"long_text\": document}) for document in docs]"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 15,
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "data": {
288 | "text/plain": [
289 | "[{'long_text': \"It's not quite summer yet, though it might as well be ...\"},\n",
290 | " {'long_text': 'PRINCETON, NJ - The Princeton wrestling team announced Thursday that the program will be welcoming seven incoming freshman as a part of the Class of 2028.'},\n",
291 | " {'long_text': 'The new true crime series — from the creators of the award-winning podcast \"Father Wants Us Dead\" — investigates the 1989 cold-case killing of a Princeton grande dame.'}]"
292 | ]
293 | },
294 | "execution_count": 15,
295 | "metadata": {},
296 | "output_type": "execute_result"
297 | }
298 | ],
299 | "source": [
300 | "# from dspy.retrieve.you_rm import YouRM\n",
301 | "\n",
302 | "news_rm = YouRM(endpoint=\"news\")\n",
303 | "res = news_rm(\"Princeton\")\n",
304 | "res"
305 | ]
306 | },
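307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "The optional parameters documented in the `YouRM` docstring can be passed the same way; for example (a sketch with illustrative values):\n",
312 | "\n",
313 | "```python\n",
314 | "news_rm_us = YouRM(endpoint=\"news\", num_web_results=5, country=\"US\", spellcheck=True)\n",
315 | "```"
316 | ]
317 | },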
307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "### Retrieve\n",
312 | "\n",
313 | "A module `dspy.Retrieve(k)` will search for the top-k passages that match a given query. \n",
314 | " \n",
315 | "By default, this will use the retriever we configure in `dspy.settings.configure()`."
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 16,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "dspy.settings.configure(lm=turbo, rm=news_rm)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 17,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "name": "stdout",
334 | "output_type": "stream",
335 | "text": [
336 | "Top 3 passages for question: What is latest news about Princeton University? \n",
337 | " ------------------------------ \n",
338 | "\n",
339 | "1] Reunions events begin Thursday, May 23, and run through Sunday, May 26. \n",
340 | "\n",
341 | "2] More than a dozen students at Princeton University said they were ending their hunger strike amid continued anti-Israel demonstrations at the university. \n",
342 | "\n",
343 | "3] Over a dozen students at Princeton University have been on hunger strike for the past week as part of a Gaza solidarity encampment on campus protesting Israel’s war on Gaza and calling on the university to disclose and divest from companies with ties to Israel, among other demands. \n",
344 | "\n"
345 | ]
346 | }
347 | ],
348 | "source": [
349 | "question = \"What is latest news about Princeton University?\"\n",
350 | "\n",
351 | "retrieve = dspy.Retrieve(k=3)\n",
352 | "topK_passages = retrieve(question).passages\n",
353 | "\n",
354 | "print(f\"Top {retrieve.k} passages for question: {question} \\n\", '-' * 30, '\\n')\n",
355 | "\n",
356 | "for idx, passage in enumerate(topK_passages):\n",
357 | " print(f'{idx+1}]', passage, '\\n')"
358 | ]
359 | },
360 | {
361 | "cell_type": "markdown",
362 | "metadata": {},
363 | "source": [
364 | "## ChatVC: RAG with News API\n",
365 | "\n",
366 | "Given a question, we'll search for the latest news through you.com news API and then feed them as context for answer generation."
367 | ]
368 | },
369 | {
370 | "cell_type": "markdown",
371 | "metadata": {},
372 | "source": [
373 | "### Signature\n",
374 | "\n",
375 | "Let's start by defining this signature: `context, question --> answer.`"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 61,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "class GenerateAnswer(dspy.Signature):\n",
385 | " \"\"\"Answer questions with the news in the context\"\"\"\n",
386 | " context = dspy.InputField(desc=\"may contain relevant news\")\n",
387 | " question = dspy.InputField()\n",
388 | " answer = dspy.OutputField(desc=\"highlights key points in context - often between 200-500 words\")"
389 | ]
390 | },
391 | {
392 | "cell_type": "markdown",
393 | "metadata": {},
394 | "source": [
395 | "### Module\n",
396 | "\n",
397 | "* The `__init__` method will simply declare the sub-modules it needs: `dspy.Retrieve` and `dspy.ChainOfThought`. The latter is defined to implement our GenerateAnswer signature.\n",
398 | "* The `forward` method will describe the control flow of answering the question using the modules we have."
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 38,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "class RAG(dspy.Module):\n",
408 | " def __init__(self, num_passages=3):\n",
409 | " super().__init__()\n",
410 | "\n",
411 | " self.retrieve = dspy.Retrieve(k=num_passages)\n",
412 | " self.generate_answer = dspy.ChainOfThought(GenerateAnswer)\n",
413 | " \n",
414 | " def forward(self, question):\n",
415 | " context = self.retrieve(question).passages\n",
416 | " prediction = self.generate_answer(context=context, question=question)\n",
417 | " return dspy.Prediction(context=context, answer=prediction.answer)"
418 | ]
419 | },
420 | {
421 | "cell_type": "markdown",
422 | "metadata": {},
423 | "source": [
424 | "### Try it out!"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 39,
430 | "metadata": {},
431 | "outputs": [
432 | {
433 | "name": "stdout",
434 | "output_type": "stream",
435 | "text": [
436 | "Question: Princeton\n",
437 | "Predicted Answer: The Princeton wrestling team announced that they will be welcoming seven incoming freshmen as part of the Class of 2028. Additionally, a new true crime series investigates the 1989 cold-case killing of a Princeton grande dame.\n"
438 | ]
439 | }
440 | ],
441 | "source": [
442 | "my_question = \"Princeton\"\n",
443 | "\n",
444 | "# Get the prediction. This contains `pred.context` and `pred.answer`.\n",
445 | "uncompiled_rag = RAG() # uncompiled (i.e., zero-shot) program\n",
446 | "pred = uncompiled_rag(my_question)\n",
447 | "\n",
448 | "# Print the contexts and the answer.\n",
449 | "print(f\"Question: {my_question}\")\n",
450 | "print(f\"Predicted Answer: {pred.answer}\")"
451 | ]
452 | },
453 | {
454 | "cell_type": "markdown",
455 | "metadata": {},
456 | "source": [
457 | "inspect the chain of thought for the LM - to iterate and modify signature."
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": 41,
463 | "metadata": {},
464 | "outputs": [
465 | {
466 | "name": "stdout",
467 | "output_type": "stream",
468 | "text": [
469 | "\n",
470 | "\n",
471 | "\n",
472 | "\n",
473 | "Answer questions with the news in the context\n",
474 | "\n",
475 | "---\n",
476 | "\n",
477 | "Follow the following format.\n",
478 | "\n",
479 | "Context: may contain relevant news\n",
480 | "\n",
481 | "Question: ${question}\n",
482 | "\n",
483 | "Reasoning: Let's think step by step in order to ${produce the answer}. We ...\n",
484 | "\n",
485 | "Answer: highlights key points in context - often between 50-100 words\n",
486 | "\n",
487 | "---\n",
488 | "\n",
489 | "Context:\n",
490 | "[1] «It's not quite summer yet, though it might as well be ...»\n",
491 | "[2] «PRINCETON, NJ - The Princeton wrestling team announced Thursday that the program will be welcoming seven incoming freshman as a part of the Class of 2028.»\n",
492 | "[3] «The new true crime series — from the creators of the award-winning podcast \"Father Wants Us Dead\" — investigates the 1989 cold-case killing of a Princeton grande dame.»\n",
493 | "\n",
494 | "Question: Princeton\n",
495 | "\n",
496 | "Reasoning: Let's think step by step in order to\u001b[32m Reasoning: Let's think step by step in order to identify the relevant information about Princeton from the context. We have three pieces of information: the weather, a wrestling team announcement, and a true crime series related to Princeton.\n",
497 | "\n",
498 | "Answer: The Princeton wrestling team announced that they will be welcoming seven incoming freshmen as part of the Class of 2028. Additionally, a new true crime series investigates the 1989 cold-case killing of a Princeton grande dame.\u001b[0m\n",
499 | "\n",
500 | "\n",
501 | "\n"
502 | ]
503 | }
504 | ],
505 | "source": [
506 | "turbo.inspect_history(n=1)"
507 | ]
508 | },
509 | {
510 | "cell_type": "markdown",
511 | "metadata": {},
512 | "source": [
513 | "## ChatVC: ReAct Agent with Tools\n",
514 | "\n",
515 | "* ReAct: an LLM agent designed to tackle complex tasks in an interactive fashion\n",
516 | "* In this example, we add multiple retrievers (news and search API) as tools in ReAct to shape the agent's interaction and response mechanisms"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": 87,
522 | "metadata": {},
523 | "outputs": [
524 | {
525 | "name": "stdout",
526 | "output_type": "stream",
527 | "text": [
528 | "Search\n",
529 | "News\n"
530 | ]
531 | }
532 | ],
533 | "source": [
534 | "# # youRM are both of type dspy.Retrieve, which has name=\"Search\" and both these tools is search\n",
535 | "# so to be able to use both tools separately, the name needs to be differentiated\n",
536 | "# set these manually\n",
537 | "\n",
538 | "search_rm = YouRM(endpoint=\"search\")\n",
539 | "print(search_rm.name)\n",
540 | "news_rm = YouRM(endpoint=\"news\")\n",
541 | "news_rm.name = \"News\"\n",
542 | "print(news_rm.name)\n",
543 | "\n",
544 | "gen = dspy.ReAct('question -> answer', tools=[search_rm, news_rm])"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": 88,
550 | "metadata": {},
551 | "outputs": [
552 | {
553 | "name": "stdout",
554 | "output_type": "stream",
555 | "text": [
556 | "Question: What news do you have on Princeton that might be interesting to a VC firm?\n",
557 | "Final Predicted Answer (after ReAct process): Princeton recently hosted an event where 600 leaders from academia, business, and government gathered to explore the rapidly evolving possibilities and challenges of artificial intelligence. This event could be of interest to a VC firm looking for investment opportunities or collaborations in the AI sector.\n"
558 | ]
559 | }
560 | ],
561 | "source": [
562 | "# Call the ReAct module on a particular input\n",
563 | "question = 'What news do you have on Princeton that might be interesting to a VC firm?'\n",
564 | "result = gen(question=question)\n",
565 | "\n",
566 | "print(f\"Question: {question}\")\n",
567 | "print(f\"Final Predicted Answer (after ReAct process): {result.answer}\")"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 89,
573 | "metadata": {},
574 | "outputs": [
575 | {
576 | "name": "stdout",
577 | "output_type": "stream",
578 | "text": [
579 | "\n",
580 | "\n",
581 | "\n",
582 | "\n",
583 | "You will be given `question` and you will respond with `answer`.\n",
584 | "\n",
585 | "To do this, you will interleave Thought, Action, and Observation steps.\n",
586 | "\n",
587 | "Thought can reason about the current situation, and Action can be the following types:\n",
588 | "\n",
589 | "(1) Search[query], which takes a search query and returns one or more potentially relevant passages from a corpus\n",
590 | "(2) News[query], which takes a search query and returns one or more potentially relevant passages from a corpus\n",
591 | "(3) Finish[answer], which returns the final `answer` and finishes the task\n",
592 | "\n",
593 | "---\n",
594 | "\n",
595 | "Follow the following format.\n",
596 | "\n",
597 | "Question: ${question}\n",
598 | "Thought 1: next steps to take based on last observation\n",
599 | "Action 1: always either Search[query] or News[query] or, when done, Finish[answer]\n",
600 | "\n",
601 | "---\n",
602 | "\n",
603 | "Question: What news do you have on Princeton that might be interesting to a VC firm?\n",
604 | "Thought 1:\u001b[32m Thought 1: To provide relevant news about Princeton that might be interesting to a VC firm, I should look for recent developments in the areas of technology, startups, investments, and academic research coming out of Princeton University or the Princeton area.\n",
605 | "\n",
606 | "Action 1: News[Princeton University technology startups investments]\u001b[0m\n",
607 | "\n",
608 | "\n",
609 | "\n",
610 | "\n",
611 | "\n",
612 | "\n",
613 | "\n",
614 | "You will be given `question` and you will respond with `answer`.\n",
615 | "\n",
616 | "To do this, you will interleave Thought, Action, and Observation steps.\n",
617 | "\n",
618 | "Thought can reason about the current situation, and Action can be the following types:\n",
619 | "\n",
620 | "(1) Search[query], which takes a search query and returns one or more potentially relevant passages from a corpus\n",
621 | "(2) News[query], which takes a search query and returns one or more potentially relevant passages from a corpus\n",
622 | "(3) Finish[answer], which returns the final `answer` and finishes the task\n",
623 | "\n",
624 | "---\n",
625 | "\n",
626 | "Follow the following format.\n",
627 | "\n",
628 | "Question: ${question}\n",
629 | "\n",
630 | "Thought 1: next steps to take based on last observation\n",
631 | "\n",
632 | "Action 1: always either Search[query] or News[query] or, when done, Finish[answer]\n",
633 | "\n",
634 | "Observation 1: observations based on action\n",
635 | "\n",
636 | "Thought 2: next steps to take based on last observation\n",
637 | "\n",
638 | "Action 2: always either Search[query] or News[query] or, when done, Finish[answer]\n",
639 | "\n",
640 | "---\n",
641 | "\n",
642 | "Question: What news do you have on Princeton that might be interesting to a VC firm?\n",
643 | "\n",
644 | "Thought 1: Thought 1: To provide relevant news about Princeton that might be interesting to a VC firm, I should look for recent developments in the areas of technology, startups, investments, and academic research coming out of Princeton University or the Princeton area.\n",
645 | "\n",
646 | "Action 1: News[Princeton University technology startups investments]\n",
647 | "\n",
648 | "Observation 1:\n",
649 | "[1] «{'long_text': 'Read more about tech startup news that breaks down funding, growth, and long-term trajectory of companies across every stage and industry.'}»\n",
650 | "[2] «{'long_text': 'Princeton welcomed 600 leaders from academia, business and government to campus to explore the rapidly evolving possibilities and challenges of artificial intelligence and to begin charting a course for New Jersey’s role in the future of AI.'}»\n",
651 | "[3] «{'long_text': 'The University’s endowment investments declined for a second consecutive year, according to the Princeton University Investment Co.'}»\n",
652 | "\n",
653 | "Thought 2:\u001b[32m Thought 2: The second observation mentions a significant event where Princeton welcomed leaders from various sectors to discuss the future of AI, which could be highly relevant to a VC firm interested in technology and innovation. This event could indicate potential investment opportunities or collaborations in the AI sector.\n",
654 | "\n",
655 | "Action 2: Finish[Princeton recently hosted an event where 600 leaders from academia, business, and government gathered to explore the rapidly evolving possibilities and challenges of artificial intelligence. This event could be of interest to a VC firm looking for investment opportunities or collaborations in the AI sector.]\u001b[0m\n",
656 | "\n",
657 | "\n",
658 | "\n"
659 | ]
660 | }
661 | ],
662 | "source": [
663 | "turbo.inspect_history(n=3)"
664 | ]
665 | }
666 | ],
667 | "metadata": {
668 | "kernelspec": {
669 | "display_name": "Python 3",
670 | "language": "python",
671 | "name": "python3"
672 | },
673 | "language_info": {
674 | "codemirror_mode": {
675 | "name": "ipython",
676 | "version": 3
677 | },
678 | "file_extension": ".py",
679 | "mimetype": "text/x-python",
680 | "name": "python",
681 | "nbconvert_exporter": "python",
682 | "pygments_lexer": "ipython3",
683 | "version": "3.9.6"
684 | }
685 | },
686 | "nbformat": 4,
687 | "nbformat_minor": 2
688 | }
689 |
--------------------------------------------------------------------------------
/you_news_and_llama_index.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# you.com News API <> Llama Index"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "DISCLAIMER: This notebook requires the latest version of LlamaIndex to be installed from this [branch](https://github.com/run-llama/llama_index/pull/13934). The cells in this notebook will not successfully run if the latest version of LlamaIndex is not being utilized."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Creating a chatbot that can provide personalized news on any topic using you.com news API involves several steps, including obtaining the necessary credentials, and programming your chatbot to use the API. "
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Install required packages"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 1,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "%%capture\n",
38 | "! pip install llama-index-retrievers-you==0.1.2\n",
39 | "! pip install dotenv==0.0.5\n",
40 | "! pip install python-dotenv==1.0.1"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "## Load API keys"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 2,
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "data": {
57 | "text/plain": [
58 | "True"
59 | ]
60 | },
61 | "execution_count": 2,
62 | "metadata": {},
63 | "output_type": "execute_result"
64 | }
65 | ],
66 | "source": [
67 | "# assumes a .env file exists with api keys YDC_API_KEY and OPENAI_API_KEY\n",
68 | "\n",
69 | "from dotenv import load_dotenv\n",
70 | "import os\n",
71 | "\n",
72 | "load_dotenv()"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "## Create a retriever"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 3,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "ename": "TypeError",
89 | "evalue": "YouRetriever.__init__() got an unexpected keyword argument 'endpoint_type'",
90 | "output_type": "error",
91 | "traceback": [
92 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
93 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
94 | "Cell \u001b[0;32mIn[3], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# TODO update the pinned version of llama-index-retrievers-you after PR is merged: https://github.com/run-llama/llama_index/pull/13934\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllama_index\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mretrievers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01myou\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m YouRetriever\n\u001b[0;32m----> 4\u001b[0m retriever \u001b[38;5;241m=\u001b[39m \u001b[43mYouRetriever\u001b[49m\u001b[43m(\u001b[49m\u001b[43mendpoint_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnews\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m retrieved_results \u001b[38;5;241m=\u001b[39m retriever\u001b[38;5;241m.\u001b[39mretrieve(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnational parks in the US\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(retrieved_results[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mget_content())\n",
95 | "\u001b[0;31mTypeError\u001b[0m: YouRetriever.__init__() got an unexpected keyword argument 'endpoint_type'"
96 | ]
97 | }
98 | ],
99 | "source": [
100 | "# TODO update the pinned version of llama-index-retrievers-you after PR is merged: https://github.com/run-llama/llama_index/pull/13934\n",
101 | "from llama_index.retrievers.you import YouRetriever\n",
102 | "\n",
103 | "retriever = YouRetriever()\n",
104 | "retrieved_results = retriever.retrieve(\"national parks in the US\")\n",
105 | "\n",
106 | "print(retrieved_results[0].get_content())"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "# Create a chat engine\n",
114 | "\n",
115 | "Conceptually, it is a stateful analogy of a Query Engine. "
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 26,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "ename": "NameError",
125 | "evalue": "name 'retriever' is not defined",
126 | "output_type": "error",
127 | "traceback": [
128 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
129 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
130 | "Cell \u001b[0;32mIn[26], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllama_index\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mchat_engine\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcontext\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ContextChatEngine\n\u001b[0;32m----> 3\u001b[0m chat_engine \u001b[38;5;241m=\u001b[39m ContextChatEngine\u001b[38;5;241m.\u001b[39mfrom_defaults(retriever\u001b[38;5;241m=\u001b[39m\u001b[43mretriever\u001b[49m)\n\u001b[1;32m 4\u001b[0m response \u001b[38;5;241m=\u001b[39m chat_engine\u001b[38;5;241m.\u001b[39mchat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCanada\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(response)\n",
131 | "\u001b[0;31mNameError\u001b[0m: name 'retriever' is not defined"
132 | ]
133 | }
134 | ],
135 | "source": [
136 | "from llama_index.core.chat_engine.context import ContextChatEngine\n",
137 | "\n",
138 | "chat_engine = ContextChatEngine.from_defaults(retriever=retriever)\n",
139 | "response = chat_engine.chat(\"Canada\")\n",
140 | "\n",
141 | "print(response)"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "### Include a timeframe"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 27,
154 | "metadata": {},
155 | "outputs": [
156 | {
157 | "ename": "NameError",
158 | "evalue": "name 'retriever' is not defined",
159 | "output_type": "error",
160 | "traceback": [
161 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
162 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
163 | "Cell \u001b[0;32mIn[27], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllama_index\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mchat_engine\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcontext\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ContextChatEngine\n\u001b[0;32m----> 3\u001b[0m chat_engine \u001b[38;5;241m=\u001b[39m ContextChatEngine\u001b[38;5;241m.\u001b[39mfrom_defaults(retriever\u001b[38;5;241m=\u001b[39m\u001b[43mretriever\u001b[49m)\n\u001b[1;32m 4\u001b[0m response \u001b[38;5;241m=\u001b[39m chat_engine\u001b[38;5;241m.\u001b[39mchat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNorway in Jan 2024\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(response)\n",
164 | "\u001b[0;31mNameError\u001b[0m: name 'retriever' is not defined"
165 | ]
166 | }
167 | ],
168 | "source": [
169 | "from llama_index.core.chat_engine.context import ContextChatEngine\n",
170 | "\n",
171 | "chat_engine = ContextChatEngine.from_defaults(retriever=retriever)\n",
172 | "response = chat_engine.chat(\"Norway in Jan 2024\")\n",
173 | "\n",
174 | "print(response)"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "# Create a chat engine with memory\n",
182 | "\n",
183 | "By keeping track of the conversation history, it can answer questions with past context in mind.\n",
184 | " \n",
185 | "`condense_plus_context` - A combination of condense_question and context. Look at the chat history and re-write the user message to be a retrieval query for the index. The retrieved text is inserted into the system prompt, so that the chat engine can either respond naturally or use the context from the query engine."
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 28,
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "ename": "ModuleNotFoundError",
195 | "evalue": "No module named 'llama_index.llms'",
196 | "output_type": "error",
197 | "traceback": [
198 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
199 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
200 | "Cell \u001b[0;32mIn[28], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllama_index\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmemory\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatMemoryBuffer\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllama_index\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mchat_engine\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CondensePlusContextChatEngine\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllama_index\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mllms\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mopenai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m OpenAI\n\u001b[1;32m 5\u001b[0m memory \u001b[38;5;241m=\u001b[39m ChatMemoryBuffer\u001b[38;5;241m.\u001b[39mfrom_defaults(token_limit\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3900\u001b[39m)\n\u001b[1;32m 6\u001b[0m llm \u001b[38;5;241m=\u001b[39m OpenAI(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgpt-4o\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
201 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'llama_index.llms'"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "from llama_index.core.memory import ChatMemoryBuffer\n",
207 | "from llama_index.core.chat_engine import CondensePlusContextChatEngine\n",
208 | "from llama_index.llms.openai import OpenAI\n",
209 | "\n",
210 | "memory = ChatMemoryBuffer.from_defaults(token_limit=3900)\n",
211 | "llm = OpenAI(model=\"gpt-4o\")\n",
212 | "\n",
213 | "chat_engine = CondensePlusContextChatEngine.from_defaults(\n",
214 | " retriever=retriever,\n",
215 | " memory=memory,\n",
216 | " llm=llm,\n",
217 | ")"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 29,
223 | "metadata": {},
224 | "outputs": [
225 | {
226 | "ename": "NameError",
227 | "evalue": "name 'chat_engine' is not defined",
228 | "output_type": "error",
229 | "traceback": [
230 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
231 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
232 | "Cell \u001b[0;32mIn[29], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# relevant news article with a summary of the news\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mchat_engine\u001b[49m\u001b[38;5;241m.\u001b[39mchat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSalesforce\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(response)\n",
233 | "\u001b[0;31mNameError\u001b[0m: name 'chat_engine' is not defined"
234 | ]
235 | }
236 | ],
237 | "source": [
238 | "# relevant news article with a summary of the news\n",
239 | "response = chat_engine.chat(\"Salesforce\")\n",
240 | "print(response)"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 30,
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "ename": "NameError",
250 | "evalue": "name 'chat_engine' is not defined",
251 | "output_type": "error",
252 | "traceback": [
253 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
254 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
255 | "Cell \u001b[0;32mIn[30], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# uses memory to relate the Hubspot query to market performance news in the query above\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mchat_engine\u001b[49m\u001b[38;5;241m.\u001b[39mchat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHubspot\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(response)\n",
256 | "\u001b[0;31mNameError\u001b[0m: name 'chat_engine' is not defined"
257 | ]
258 | }
259 | ],
260 | "source": [
261 | "# uses memory to relate the Hubspot query to market performance news in the query above\n",
262 | "response = chat_engine.chat(\"Hubspot\")\n",
263 | "print(response)"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "## Personalize news\n",
271 | "\n",
272 | "Using a custom prompt, this news feed can be customized to specific threads, such as this one focused on politics and economics."
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 31,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "ename": "NameError",
282 | "evalue": "name 'OpenAI' is not defined",
283 | "output_type": "error",
284 | "traceback": [
285 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
286 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
287 | "Cell \u001b[0;32mIn[31], line 16\u001b[0m\n\u001b[1;32m 5\u001b[0m custom_prompt \u001b[38;5;241m=\u001b[39m PromptTemplate(\n\u001b[1;32m 6\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\\\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03mYou're a top-tier reporter focused on news about {question}.\\n\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 13\u001b[0m )\n\u001b[1;32m 15\u001b[0m memory \u001b[38;5;241m=\u001b[39m ChatMemoryBuffer\u001b[38;5;241m.\u001b[39mfrom_defaults(token_limit\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3900\u001b[39m)\n\u001b[0;32m---> 16\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mOpenAI\u001b[49m(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgpt-4o\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 18\u001b[0m chat_engine \u001b[38;5;241m=\u001b[39m CondensePlusContextChatEngine\u001b[38;5;241m.\u001b[39mfrom_defaults(\n\u001b[1;32m 19\u001b[0m retriever\u001b[38;5;241m=\u001b[39mretriever,\n\u001b[1;32m 20\u001b[0m condense_question_prompt\u001b[38;5;241m=\u001b[39mcustom_prompt,\n\u001b[1;32m 21\u001b[0m memory\u001b[38;5;241m=\u001b[39mmemory,\n\u001b[1;32m 22\u001b[0m llm\u001b[38;5;241m=\u001b[39mllm,\n\u001b[1;32m 23\u001b[0m )\n",
288 | "\u001b[0;31mNameError\u001b[0m: name 'OpenAI' is not defined"
289 | ]
290 | }
291 | ],
292 | "source": [
293 | "from llama_index.core.memory import ChatMemoryBuffer\n",
294 | "from llama_index.core.chat_engine import CondensePlusContextChatEngine\n",
295 | "from llama_index.core import PromptTemplate\n",
296 | "\n",
297 | "custom_prompt = PromptTemplate(\n",
298 | " \"\"\"\\\n",
299 | "You're a top-tier reporter focused on news about {question}.\\n\n",
300 | "YOU SHOULD ALWAYS RESEARCH THE FOLLOWING QUERIES: \\n\n",
301 | "1. {question} recent news and latest development\\n\n",
302 | "2. recent regulatory changes affecting {question}\\n\n",
303 | "3. recent economic events impacting {question}\\n\n",
304 | "\"\"\"\n",
305 | ")\n",
306 | "\n",
307 | "memory = ChatMemoryBuffer.from_defaults(token_limit=3900)\n",
308 | "llm = OpenAI(model=\"gpt-4o\")\n",
309 | "\n",
310 | "chat_engine = CondensePlusContextChatEngine.from_defaults(\n",
311 | " retriever=retriever,\n",
312 | " condense_question_prompt=custom_prompt,\n",
313 | " memory=memory,\n",
314 | " llm=llm,\n",
315 | ")"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 32,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "ename": "NameError",
325 | "evalue": "name 'chat_engine' is not defined",
326 | "output_type": "error",
327 | "traceback": [
328 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
329 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
330 | "Cell \u001b[0;32mIn[32], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mchat_engine\u001b[49m\u001b[38;5;241m.\u001b[39mchat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMongolia\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(response)\n",
331 | "\u001b[0;31mNameError\u001b[0m: name 'chat_engine' is not defined"
332 | ]
333 | }
334 | ],
335 | "source": [
336 | "response = chat_engine.chat(\"Mongolia\")\n",
337 | "print(response)"
338 | ]
339 | }
340 | ],
341 | "metadata": {
342 | "kernelspec": {
343 | "display_name": "Python 3",
344 | "language": "python",
345 | "name": "python3"
346 | },
347 | "language_info": {
348 | "codemirror_mode": {
349 | "name": "ipython",
350 | "version": 3
351 | },
352 | "file_extension": ".py",
353 | "mimetype": "text/x-python",
354 | "name": "python",
355 | "nbconvert_exporter": "python",
356 | "pygments_lexer": "ipython3",
357 | "version": "3.10.14"
358 | }
359 | },
360 | "nbformat": 4,
361 | "nbformat_minor": 2
362 | }
363 |
--------------------------------------------------------------------------------
/you_pinecone_multiretriever.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "id": "ohp-N8APT5YR"
8 | },
9 | "outputs": [],
10 | "source": [
11 | "!pip install -qU \\\n",
12 | " langchain==0.0.335 \\\n",
13 | " pinecone-client==2.2.4 \\\n",
14 | " openai \\\n",
15 | " datasets \\\n",
16 | " tiktoken\n"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {
23 | "id": "6_1V2yd2RVH1"
24 | },
25 | "outputs": [],
26 | "source": [
27 | "from langchain.retrievers.you import YouRetriever\n",
28 | "from langchain.chains import RetrievalQA\n",
29 | "from langchain.chat_models.openai import ChatOpenAI\n",
30 | "import os\n",
31 | "\n",
32 | "os.environ[\"OPENAI_API_KEY\"] = \"<>\"\n",
33 | "os.environ[\"YDC_API_KEY\"] = \"<>\"\n",
34 | "\n",
35 | "llm = ChatOpenAI(model=\"gpt-3.5-turbo-16k\")\n",
36 | "\n",
37 | "you_retriever = YouRetriever()\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {
43 | "id": "-3kKs-5_Zvw-"
44 | },
45 | "source": [
46 | "## Create Pinecone Indes"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {
53 | "colab": {
54 | "base_uri": "https://localhost:8080/"
55 | },
56 | "id": "4PIbLT-bWimO",
57 | "outputId": "5c319dc9-b6e1-4718-82f6-c36a2303949f"
58 | },
59 | "outputs": [
60 | {
61 | "output_type": "execute_result",
62 | "data": {
63 | "text/plain": [
64 | "Dataset({\n",
65 | " features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],\n",
66 | " num_rows: 41584\n",
67 | "})"
68 | ]
69 | },
70 | "metadata": {},
71 | "execution_count": 3
72 | }
73 | ],
74 | "source": [
75 | "from datasets import load_dataset\n",
76 | "\n",
77 | "dataset = load_dataset(\n",
78 | " \"jamescalam/ai-arxiv-chunked\",\n",
79 | " split=\"train\"\n",
80 | ")\n",
81 | "\n",
82 | "dataset"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 4,
88 | "metadata": {
89 | "id": "NE3HGAnOZ1f6"
90 | },
91 | "outputs": [],
92 | "source": [
93 | "import pinecone\n",
94 | "\n",
95 | "# get API key from app.pinecone.io and environment from console\n",
96 | "pinecone.init(\n",
97 | " api_key=\"<>\",\n",
98 | " environment=\"<>\"\n",
99 | ")"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 5,
105 | "metadata": {
106 | "id": "z_613JFoaVc8"
107 | },
108 | "outputs": [],
109 | "source": [
110 | "import time\n",
111 | "\n",
112 | "index_name = 'you-pinecone'\n",
113 | "\n",
114 | "if index_name not in pinecone.list_indexes():\n",
115 | " pinecone.create_index(\n",
116 | " index_name,\n",
117 | " dimension=1536,\n",
118 | " metric='cosine'\n",
119 | " )\n",
120 | " # wait for index to finish initialization\n",
121 | " while not pinecone.describe_index(index_name).status['ready']:\n",
122 | " time.sleep(1)\n",
123 | "\n",
124 | "index = pinecone.Index(index_name)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 9,
130 | "metadata": {
131 | "id": "2FwLb0PhaetT"
132 | },
133 | "outputs": [],
134 | "source": [
135 | "from langchain.embeddings.openai import OpenAIEmbeddings\n",
136 | "\n",
137 | "embed_model = OpenAIEmbeddings(\n",
138 | " model=\"text-embedding-ada-002\",\n",
139 | " disallowed_special=()\n",
140 | ")"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "colab": {
148 | "base_uri": "https://localhost:8080/",
149 | "height": 49,
150 | "referenced_widgets": [
151 | "052489e2f8f44922a7c0921f586d9434",
152 | "0e29cd87f84d443789b1381c305f94eb",
153 | "50f29efa1f004ac99f70e36f0e584954",
154 | "d0194b6d5d9e4dd1943373e3f9dcc27b",
155 | "aa183e6299d747b6b3b9cc705de44909",
156 | "2dfa397456cb475d86aa842f848c9f3e",
157 | "6bd3fd3959e84eddaf03acb4cade42c4",
158 | "c7b10b88f2214064925d980b0466fb9d",
159 | "5c12481a504e43fd8a2ff24393a7c617",
160 | "852d273e2e5740fbb102e2a1c8dc1459",
161 | "d99c983ccb9f4c1f80d66d2400f660bb"
162 | ]
163 | },
164 | "id": "JgX-ot35ag31",
165 | "outputId": "44d6e33b-5ff3-40c0-d8ac-d1f19a38b608"
166 | },
167 | "outputs": [
168 | {
169 | "output_type": "display_data",
170 | "data": {
171 | "text/plain": [
172 | " 0%| | 0/416 [00:00, ?it/s]"
173 | ],
174 | "application/vnd.jupyter.widget-view+json": {
175 | "version_major": 2,
176 | "version_minor": 0,
177 | "model_id": "052489e2f8f44922a7c0921f586d9434"
178 | }
179 | },
180 | "metadata": {}
181 | }
182 | ],
183 | "source": [
184 | "from tqdm.auto import tqdm # for progress bar\n",
185 | "\n",
186 | "data = dataset.to_pandas() # this makes it easier to iterate over the dataset\n",
187 | "\n",
188 | "batch_size = 100\n",
189 | "\n",
190 | "for i in tqdm(range(0, len(data), batch_size)):\n",
191 | " i_end = min(len(data), i+batch_size)\n",
192 | " # get batch of data\n",
193 | " batch = data.iloc[i:i_end]\n",
194 | " # generate unique ids for each chunk\n",
195 | " ids = [f\"{x['doi']}-{x['chunk-id']}\" for i, x in batch.iterrows()]\n",
196 | " # get text to embed\n",
197 | " texts = [x['chunk'] for _, x in batch.iterrows()]\n",
198 | " # embed text\n",
199 | " embeds = embed_model.embed_documents(texts)\n",
200 | " # get metadata to store in Pinecone\n",
201 | " metadata = [\n",
202 | " {'text': x['chunk'],\n",
203 | " 'source': x['source'],\n",
204 | " 'title': x['title']} for i, x in batch.iterrows()\n",
205 | " ]\n",
206 | " # add to Pinecone\n",
207 | " index.upsert(vectors=zip(ids, embeds, metadata))"
208 | ]
209 | },
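210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "As a quick sanity check after the upsert loop, you can inspect the index stats; the reported vector count should match the number of chunks upserted (a minimal sketch):\n",
215 | "\n",
216 | "```python\n",
217 | "index.describe_index_stats()\n",
218 | "```"
219 | ]
220 | },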
210 | {
211 | "cell_type": "code",
212 | "execution_count": 10,
213 | "metadata": {
214 | "colab": {
215 | "base_uri": "https://localhost:8080/"
216 | },
217 | "id": "mrNOhBiOak3D",
218 | "outputId": "87f494c0-44e2-4c45-dd24-3d12522c7eb1"
219 | },
220 | "outputs": [
221 | {
222 | "output_type": "stream",
223 | "name": "stderr",
224 | "text": [
225 | "/usr/local/lib/python3.10/dist-packages/langchain/vectorstores/pinecone.py:59: UserWarning: Passing in `embedding` as a Callable is deprecated. Please pass in an Embeddings object instead.\n",
226 | " warnings.warn(\n"
227 | ]
228 | }
229 | ],
230 | "source": [
231 | "from langchain.vectorstores import Pinecone\n",
232 | "\n",
233 | "text_field = \"text\" # the metadata field that contains our text\n",
234 | "\n",
235 | "# initialize the vector store object\n",
236 | "vectorstore = Pinecone(\n",
237 | " index, embed_model.embed_query, text_field\n",
238 | ")\n",
239 | "pinecone_retriever = vectorstore.as_retriever()"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 11,
245 | "metadata": {
246 | "id": "LJXhzDq4SuoH"
247 | },
248 | "outputs": [],
249 | "source": [
250 | "retriever_infos = [\n",
251 | " {\n",
252 | " \"name\": \"pinecone\",\n",
253 | " \"description\": \"use this tool when you need information about LLMs (llama 2, gpt-4, etc) or ML\",\n",
254 | " \"retriever\": pinecone_retriever\n",
255 | " }, {\n",
256 | " \"name\": \"you.com\",\n",
257 | " \"description\": \"use this tool for general purpose queries that can be found on the web\",\n",
258 | " \"retriever\": you_retriever\n",
259 | " }\n",
260 | "]"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 12,
266 | "metadata": {
267 | "id": "EBz0LpcqSwNB"
268 | },
269 | "outputs": [],
270 | "source": [
271 | "from langchain.chains.router.multi_retrieval_qa import MultiRetrievalQAChain\n",
272 | "\n",
273 | "retrieval_qa = MultiRetrievalQAChain.from_retrievers(\n",
274 | " llm=llm, retriever_infos=retriever_infos,\n",
275 | " verbose=True\n",
276 | ")"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 13,
282 | "metadata": {
283 | "colab": {
284 | "base_uri": "https://localhost:8080/"
285 | },
286 | "id": "OR1t0Bxbbe7u",
287 | "outputId": "b53447f7-54f4-4246-b0a8-64fd7b50f339"
288 | },
289 | "outputs": [
290 | {
291 | "output_type": "stream",
292 | "name": "stdout",
293 | "text": [
294 | "\n",
295 | "\n",
296 | "\u001b[1m> Entering new MultiRetrievalQAChain chain...\u001b[0m\n"
297 | ]
298 | },
299 | {
300 | "output_type": "stream",
301 | "name": "stderr",
302 | "text": [
303 | "/usr/local/lib/python3.10/dist-packages/langchain/chains/llm.py:321: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
304 | " warnings.warn(\n"
305 | ]
306 | },
307 | {
308 | "output_type": "stream",
309 | "name": "stdout",
310 | "text": [
311 | "pinecone: {'query': 'tell me about the llama 2 LLM'}\n",
312 | "\u001b[1m> Finished chain.\u001b[0m\n"
313 | ]
314 | }
315 | ],
316 | "source": [
317 | "res = retrieval_qa.invoke({\"input\": \"tell me about the llama 2 llm\"})"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "source": [
323 | "res"
324 | ],
325 | "metadata": {
326 | "colab": {
327 | "base_uri": "https://localhost:8080/"
328 | },
329 | "id": "C58Nx5RZ-Dcl",
330 | "outputId": "3bff5482-a1b5-4da0-97e5-06cbb5dd8c88"
331 | },
332 | "execution_count": 14,
333 | "outputs": [
334 | {
335 | "output_type": "execute_result",
336 | "data": {
337 | "text/plain": [
338 | "{'input': 'tell me about the llama 2 llm',\n",
339 | " 'query': 'tell me about the llama 2 LLM',\n",
340 | " 'result': 'Llama 2 is a collection of pretrained and fine-tuned large language models (LLMs). These LLMs range in scale from 7 billion to 70 billion parameters. The main focus of Llama 2 is on optimizing the LLMs for dialogue use cases. \\n\\nAccording to the developers, Llama 2 models outperform open-source chat models on most benchmarks that were tested. They also claim that based on their humane evaluations for helpfulness and safety, Llama 2 models may be a suitable substitute for closed-source models.\\n\\nThe approach to fine-tuning and safety of the Llama 2 models is described in detail in the research work. However, the specific details of the fine-tuning and safety methods are not mentioned in the given context.\\n\\nOverall, Llama 2 aims to provide high-performing language models for dialogue applications, offering improved performance compared to existing open-source models.'}"
341 | ]
342 | },
343 | "metadata": {},
344 | "execution_count": 14
345 | }
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 15,
351 | "metadata": {
352 | "colab": {
353 | "base_uri": "https://localhost:8080/"
354 | },
355 | "id": "A2egZBwcbuBI",
356 | "outputId": "41c73e4e-08b3-4282-fa23-8c87994af857"
357 | },
358 | "outputs": [
359 | {
360 | "output_type": "stream",
361 | "name": "stdout",
362 | "text": [
363 | "\n",
364 | "\n",
365 | "\u001b[1m> Entering new MultiRetrievalQAChain chain...\u001b[0m\n"
366 | ]
367 | },
368 | {
369 | "output_type": "stream",
370 | "name": "stderr",
371 | "text": [
372 | "/usr/local/lib/python3.10/dist-packages/langchain/chains/llm.py:321: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
373 | " warnings.warn(\n"
374 | ]
375 | },
376 | {
377 | "output_type": "stream",
378 | "name": "stdout",
379 | "text": [
380 | "you.com: {'query': 'who is the current German chancellor?'}\n",
381 | "\u001b[1m> Finished chain.\u001b[0m\n"
382 | ]
383 | }
384 | ],
385 | "source": [
386 | "res = retrieval_qa.invoke({\"input\": \"who is the German chancellor now?\"})"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 16,
392 | "metadata": {
393 | "colab": {
394 | "base_uri": "https://localhost:8080/"
395 | },
396 | "id": "n_qzKbCHbwuY",
397 | "outputId": "c64c6835-9c99-4faf-95df-27f34c10a96d"
398 | },
399 | "outputs": [
400 | {
401 | "output_type": "execute_result",
402 | "data": {
403 | "text/plain": [
404 | "{'input': 'who is the German chancellor now?',\n",
405 | " 'query': 'who is the current German chancellor?',\n",
406 | " 'result': 'The current German chancellor is Olaf Scholz.'}"
407 | ]
408 | },
409 | "metadata": {},
410 | "execution_count": 16
411 | }
412 | ],
413 | "source": [
414 | "res"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {
421 | "id": "utIgAtb8cuzy"
422 | },
423 | "outputs": [],
424 | "source": []
425 | }
426 | ],
427 | "metadata": {
428 | "colab": {
429 | "provenance": []
430 | },
431 | "kernelspec": {
432 | "display_name": "Python 3",
433 | "name": "python3"
434 | },
435 | "language_info": {
436 | "name": "python"
437 | },
438 | "widgets": {
439 | "application/vnd.jupyter.widget-state+json": {
440 | "052489e2f8f44922a7c0921f586d9434": {
441 | "model_module": "@jupyter-widgets/controls",
442 | "model_name": "HBoxModel",
443 | "model_module_version": "1.5.0",
444 | "state": {
445 | "_dom_classes": [],
446 | "_model_module": "@jupyter-widgets/controls",
447 | "_model_module_version": "1.5.0",
[... ipywidgets saved state elided: HBox, HTML, and FloatProgress widget models plus their Layout/Style models for the notebook's tqdm progress bar; final reading "100%", "416/416 [26:51<00:00, 3.60s/it]" ...]
782 | }
783 | }
784 | },
785 | "nbformat": 4,
786 | "nbformat_minor": 0
787 | }
--------------------------------------------------------------------------------