├── README.md ├── all_tools_llm.py ├── auto_medium.py ├── data ├── code.txt ├── images │ ├── Lanchain_db.jpeg │ ├── LangChain.jpeg │ ├── LangChain_Google.jpeg │ └── langchain.jpg ├── markdown.txt ├── notebook.txt ├── summary_strategy.pdf └── tracklist.csv ├── get_doc.py ├── news_api.py └── notebooks ├── Chat_with_CSV_&_Excel_using_LangChain_and_OpenAI.ipynb ├── Langchain_doc_chroma.ipynb ├── chains.ipynb ├── langchain.ipynb ├── langchain_deeplearning_course.ipynb ├── langchain_pinecone.ipynb ├── langchain_text.file.ipynb └── llama_and_chroma.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Langchain 🦜🔗 2 | ### This repository will show how Langchain🦜🔗 library can be used and integrated 3 | 4 | Medium articles: 5 | 6 | [Give openai GPT internet access](https://medium.com/@rubentak/give-openai-models-with-internet-access-using-langchain-7d5849f33e03) 7 | 8 | 9 | [LangChain chains](https://medium.com/@rubentak/langchain-using-different-langchain-chains-to-write-a-new-episode-for-the-office-us-7c45d869d895) 10 | 11 | [Talk with your CSV files!](https://medium.com/@rubentak/talk-to-your-data-base-with-gpt-models-using-langchain-csv-19e2b32aa729) 12 | 13 | [Talk to your PDF files in a Pinecone Vector Databases with GPT-4: A Step-by-Step Tutorial](https://medium.com/@rubentak/talk-to-your-pdf-files-in-a-pinecone-vector-databases-with-gpt-4-a-step-by-step-tutorial-1632cf7aa041) 14 | 15 | 16 | LangChain is a Python library that allows you to chain together multiple APIs and external data sources to create more complex and dynamic AI applications. It acts as a middle layer between different machine learning models and external data sources like databases, web APIs, and file systems, allowing you to easily incorporate them into your AI workflows. 
17 | 18 | With LangChain, you can chain together multiple models from different libraries, like OpenAI's GPT-3 or Hugging Face's transformers, and connect them to a variety of external data sources. This allows you to create more complex AI applications that can access and utilize a wider range of data sources. 19 | 20 | For example, you can use LangChain to create an AI chatbot that can answer questions by accessing data from a database or external API. The chatbot can use multiple models to interpret and understand the user's questions and then use the external data sources to provide relevant answers. 21 | 22 | Overall, LangChain provides a flexible and powerful tool for building more sophisticated AI applications by allowing you to connect and utilize a wider range of models and data sources. 23 | 24 |
25 | Langchain image 26 | 27 | [Documentation Langchain](https://python.langchain.com/en/latest/index.html) 28 | -------------------------------------------------------------------------------- /all_tools_llm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | This is a script to run all the tools in the langchain library. It is meant to be used as a reference for how to use the library. 4 | https://github.com/hwchase17/langchain/blob/master/docs/modules/agents/tools/getting_started.md 5 | 6 | ''' 7 | 8 | 9 | from langchain. agents import load_tools 10 | from langchain. agents import initialize_agent 11 | from langchain. agents import AgentType 12 | from langchain. memory import ConversationBufferMemory 13 | from langchain. chat_models import ChatOpenAI 14 | import os 15 | import credentials 16 | from newsapi import NewsApiClient 17 | 18 | 19 | 20 | #%% 21 | # set up openai api key 22 | openai_api_key = os.environ.get('OPENAI_API_KEY') 23 | os.environ["OPENAI_API_KEY"] = credentials. OPENAI_API_KEY 24 | os.environ["GOOGLE_API_KEY"] = credentials. google_api_key 25 | os.environ["GOOGLE_CSE_ID"] = credentials. google_cse_id 26 | os.environ["WOLFRAM_ALPHA_APPID"] = credentials. 
#os.environ["NEWS_API_KEY"] = credentials.news_api_key

#newsapi = NewsApiClient(api_key='...')

# The conversational ReAct prompt reads earlier turns from the `chat_history`
# variable, so the memory has to publish its buffer under that exact key.
# With the default key ("history") the caller is forced to pass `chat_history`
# by hand, and saving the turn then fails with "One input key expected ..."
# because two non-memory inputs are present.
memory = ConversationBufferMemory(memory_key="chat_history")
llm = ChatOpenAI()

# NOTE(review): 'python_repl' and 'terminal' let the model run arbitrary code
# and shell commands on this machine -- only use them in a sandbox.
tools = load_tools(
    [
        'wikipedia',
        'llm-math',
        'google-search',
        'python_repl',
        'wolfram-alpha',
        'terminal',
        #'news-api',
        #'podcast-api',
        #'openweathermap-api'
    ],
    llm=llm,
)
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
)


def get_prompt_input_key(inputs, memory_variables):
    """Return the first memory variable that also appears as a key of *inputs*.

    Args:
        inputs: Mapping of input names to values.
        memory_variables: Iterable of variable names, checked in order.

    Returns:
        The first element of *memory_variables* that is a key of *inputs*.

    Raises:
        ValueError: If none of *memory_variables* is a key of *inputs*.

    NOTE(review): nothing in this script calls this helper. LangChain's own
    function of the same name returns the single *non*-memory key instead;
    confirm which contract is intended before wiring it in anywhere.
    """
    input_keys = list(inputs.keys())

    for key in memory_variables:
        if key in input_keys:
            return key

    raise ValueError(f"One input key expected got {input_keys}")


# The memory object supplies `chat_history`; only the new message is passed.
agent.run(input="What's up ChatGPT?")
'''Read a Jupyter notebook and dump its markdown, code and combined cell sources to text files.'''


#%% READING
# Importing the libraries
from itertools import islice

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

NOTEBOOK_PATH = "notebooks/langchain.ipynb"
PREVIEW_LINES = 50

# Print the first PREVIEW_LINES raw lines of the notebook file.
# The context manager closes the handle, which the bare open() never did.
with open(NOTEBOOK_PATH, "r", encoding="utf-8") as text:
    for line in islice(text, PREVIEW_LINES):
        print(line)

#%% CLEANING
# when cell_type is "code" then "source" is code
# when cell_type is "markdown" then "source" is markdown
import json

# Notebooks are UTF-8 JSON; relying on the locale encoding breaks on
# platforms whose default is not UTF-8.
with open(NOTEBOOK_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# extract the code and markdown sources from the notebook
cells = data['cells']
markdown = [cell['source'] for cell in cells if cell['cell_type'] == 'markdown']
code = [cell['source'] for cell in cells if cell['cell_type'] == 'code']
notebook = [cell['source'] for cell in cells]

# write one cell source per line (each source is written via str(), exactly
# as the "%s\n" formatting did before)
dumps = (
    ('data/markdown.txt', markdown),
    ('data/code.txt', code),
    ('data/notebook.txt', notebook),
)
for path, items in dumps:
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{item}\n" for item in items)

# read the generated dumps back in as text; the handles are closed right
# away instead of being left open for the life of the process
with open("data/markdown.txt", "r", encoding="utf-8") as f:
    markdown = f.read()
with open("data/code.txt", "r", encoding="utf-8") as f:
    code = f.read()
with open("data/notebook.txt", "r", encoding="utf-8") as f:
    notebook = f.read()
prompt=prompt)'] 14 | ['print(chain.run("potatoes"))\n', 'print(chain.run("rice"))'] 15 | ['# Install serpapi\n', '#!pip install google-search-results'] 16 | ['#import libraries\n', 'from langchain.agents import load_tools\n', 'from langchain.agents import initialize_agent\n', 'from langchain.llms import OpenAI'] 17 | ['# Load the model\n', 'llm = OpenAI(temperature=0)'] 18 | ['# Load in some tools to use\n', '\n', '#os.environ["SERPAPI_API_KEY"] = "..."\n', '\n', 'tools = load_tools(["serpapi", "llm-math"], llm=llm)'] 19 | ['agent = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True)'] 20 | ['agent.run("What is the hight of Obama? And how many cans of coke can you stack to reach that height?")'] 21 | ['from langchain import OpenAI, ConversationChain'] 22 | ['llm = OpenAI(temperature=0)\n', 'conversation = ConversationChain(llm=llm, verbose=True)'] 23 | ['conversation.predict(input="Hi how are you doing!")'] 24 | ['conversation.predict(input="I\'m doing well! 
Just having a conversation with my newly created langchain agent with memory.")'] 25 | ['conversation.predict(input="I would like to get to know a bit about what LangChain is")'] 26 | ['conversation.predict(input="...")'] 27 | [] 28 | -------------------------------------------------------------------------------- /data/images/Lanchain_db.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rubentak/Langchain/f458ac02d6725e2083b0570793bfd6dd4066a426/data/images/Lanchain_db.jpeg -------------------------------------------------------------------------------- /data/images/LangChain.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rubentak/Langchain/f458ac02d6725e2083b0570793bfd6dd4066a426/data/images/LangChain.jpeg -------------------------------------------------------------------------------- /data/images/LangChain_Google.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rubentak/Langchain/f458ac02d6725e2083b0570793bfd6dd4066a426/data/images/LangChain_Google.jpeg -------------------------------------------------------------------------------- /data/images/langchain.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rubentak/Langchain/f458ac02d6725e2083b0570793bfd6dd4066a426/data/images/langchain.jpg -------------------------------------------------------------------------------- /data/markdown.txt: -------------------------------------------------------------------------------- 1 | ['# Getting started with Langchain\n', '\n', 'In this tutorial, we will walk through the basics of using LangChain to create a simple AI chatbot that can answer questions using OpenAI and Serpapi.'] 2 | ['### The temperature of a llm is a hyperparameter that controls the randomness of the output. 
It is a value between 0 and 1. A higher temperature will result in more random output. A lower temperature will result in more predictable output. For this tutorial, we will set the temperature to 0.9. You can play aroun with this yourself to see how it affects the output.'] 3 | ['### This works! But what if we what if we want to ask a question about a different food? We can use the prompt template class to do this.'] 4 | ['# Getting started with prompt templates\n', '\n', '### A prompt template is a string that contains variables that can be filled in with different values. For example, you could have a prompt template that looks like this:'] 5 | ['Notice how the answer is different from the previous one. This is because of the randomness of the llm. If we want to get the same answer every time, we can set the seed of the llm.'] 6 | ['# Chaining\n', '\n', '### Here, we combine LLMs and prompts in multistep workflows using the prompt template class.'] 7 | ['# Agents: Dynamically call chains based on user input\n', '\n', 'SerpApi is a search engine results page (SERP) scraping and parsing API that allows developers to retrieve and analyze data from various search engines, including Google, Bing, Yahoo, and more.\n', '\n', '\n', 'To use SerpApi in Python, you will need to install the SerpApi Python module using pip. An API key can be created here:\n', '\n', '[Serpapi key](https://serpapi.com/users/sign_up)\n', '\n', '[Serpapi documentation](https://python.langchain.com/en/latest/modules/agents/tools/examples/serpapi.html)\n', '\n', 'In the next part, we are going to combine OpenAI with SerpApi to create a chatbot that can answer questions with internet search results.\n', '\n', '[Agent langchain documentation](https://python.langchain.com/en/latest/modules/agents.html)\n', '\n', '\n'] 8 | ['Tools are functions that agents can use to interact with the world. These tools can be generic utilities (e.g. 
search), other chains, or even other agents.\n', 'Here we use the SerpApi and llm-math tools. We will use the SerpApi tool to search the internet for answers to our questions. We will use the llm-math tool to answer math questions.\n', '\n', '[Tools documentation](https://python.langchain.com/en/latest/modules/agents/tools/getting_started.html)'] 9 | ["### Finally, let's initialize an agent with:\n", ' 1. The tools\n', ' 2. The language model\n', ' 3. The type of agent we want to use.'] 10 | ['For the agent, I have chosen the zero-shot-react-description agent. This agent uses the ReAct framework to determine which tool to use based solely on the tool’s description. Any number of tools can be provided. This agent requires that a description is provided for each tool.\n', '\n', '[Agent types](https://python.langchain.com/en/latest/modules/agents/agents/agent_types.html)\n', '\n', "By setting verbose=True, we can see the agent's internal state as it processes the input.\n", '\n', 'Other agent types could also be chosen\n', '\n', 'Now, let us ask our agent a question for which it will use the serpapi tool to search the internet for an answer and need to calculate the answer using the llm-math tool.'] 11 | ['# Memory: Add state to chains and agents\n', '\n', 'By adding memory to our agents, we can make them more dynamic and interactive. We can have a conversation and the agent will remember what we said and use that information to answer our questions.\n'] 12 | ['# Continue the conversation yourself!'] 13 | ['# This is the end of the tutorial. 
We hope you enjoyed it!\n', '\n', 'Follow me on Github and Medium for more content:\n', '\n', '- [Github](https://github.com/rubentak)\n', '- [Medium](https://medium.com/@rubentak)'] 14 | -------------------------------------------------------------------------------- /data/notebook.txt: -------------------------------------------------------------------------------- 1 | ['# Getting started with Langchain\n', '\n', 'In this tutorial, we will walk through the basics of using LangChain to create a simple AI chatbot that can answer questions using OpenAI and Serpapi.'] 2 | ['# import libraries\n', 'import os\n', 'from langchain.llms import OpenAI'] 3 | ['#create a new openai api key\n', '#os.environ["OPENAI_API_KEY"] = "..."'] 4 | ['# set up openai api key\n', "openai_api_key = os.environ.get('OPENAI_API_KEY')"] 5 | ['### The temperature of a llm is a hyperparameter that controls the randomness of the output. It is a value between 0 and 1. A higher temperature will result in more random output. A lower temperature will result in more predictable output. For this tutorial, we will set the temperature to 0.9. You can play aroun with this yourself to see how it affects the output.'] 6 | ['# create a llm\n', 'llm = OpenAI(temperature = 0.9)'] 7 | ['text = "What are 4 countries where they eat a lot of potatoes?"\n', 'print(llm(text))'] 8 | ['### This works! But what if we what if we want to ask a question about a different food? We can use the prompt template class to do this.'] 9 | ['# Getting started with prompt templates\n', '\n', '### A prompt template is a string that contains variables that can be filled in with different values. 
For example, you could have a prompt template that looks like this:'] 10 | ['from langchain.prompts import PromptTemplate'] 11 | ['prompt = PromptTemplate(\n', ' input_variables=["food"],\n', ' template="What are 4 countries where they eat a lot of {food}?",\n', ')'] 12 | ['print(prompt.format(food="potatoes"))'] 13 | ['print(llm(prompt.format(food="potatoes")))'] 14 | ['Notice how the answer is different from the previous one. This is because of the randomness of the llm. If we want to get the same answer every time, we can set the seed of the llm.'] 15 | ['print(llm(prompt.format(food="rice")))'] 16 | ['# Chaining\n', '\n', '### Here, we combine LLMs and prompts in multistep workflows using the prompt template class.'] 17 | ['from langchain.prompts import PromptTemplate\n', 'from langchain.llms import OpenAI\n', 'from langchain.chains import LLMChain'] 18 | ['llm = OpenAI(temperature=0.9)\n', '\n', 'prompt = PromptTemplate(\n', ' input_variables=["food"],\n', ' template="What are 4 countries where they eat a lot of {food}?",\n', ')'] 19 | ['chain = LLMChain(llm=llm, prompt=prompt)'] 20 | ['print(chain.run("potatoes"))\n', 'print(chain.run("rice"))'] 21 | ['# Agents: Dynamically call chains based on user input\n', '\n', 'SerpApi is a search engine results page (SERP) scraping and parsing API that allows developers to retrieve and analyze data from various search engines, including Google, Bing, Yahoo, and more.\n', '\n', '\n', 'To use SerpApi in Python, you will need to install the SerpApi Python module using pip. 
An API key can be created here:\n', '\n', '[Serpapi key](https://serpapi.com/users/sign_up)\n', '\n', '[Serpapi documentation](https://python.langchain.com/en/latest/modules/agents/tools/examples/serpapi.html)\n', '\n', 'In the next part, we are going to combine OpenAI with SerpApi to create a chatbot that can answer questions with internet search results.\n', '\n', '[Agent langchain documentation](https://python.langchain.com/en/latest/modules/agents.html)\n', '\n', '\n'] 22 | ['# Install serpapi\n', '#!pip install google-search-results'] 23 | ['#import libraries\n', 'from langchain.agents import load_tools\n', 'from langchain.agents import initialize_agent\n', 'from langchain.llms import OpenAI'] 24 | ['# Load the model\n', 'llm = OpenAI(temperature=0)'] 25 | ['Tools are functions that agents can use to interact with the world. These tools can be generic utilities (e.g. search), other chains, or even other agents.\n', 'Here we use the SerpApi and llm-math tools. We will use the SerpApi tool to search the internet for answers to our questions. We will use the llm-math tool to answer math questions.\n', '\n', '[Tools documentation](https://python.langchain.com/en/latest/modules/agents/tools/getting_started.html)'] 26 | ['# Load in some tools to use\n', '\n', '#os.environ["SERPAPI_API_KEY"] = "..."\n', '\n', 'tools = load_tools(["serpapi", "llm-math"], llm=llm)'] 27 | ["### Finally, let's initialize an agent with:\n", ' 1. The tools\n', ' 2. The language model\n', ' 3. The type of agent we want to use.'] 28 | ['agent = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True)'] 29 | ['For the agent, I have chosen the zero-shot-react-description agent. This agent uses the ReAct framework to determine which tool to use based solely on the tool’s description. Any number of tools can be provided. 
This agent requires that a description is provided for each tool.\n', '\n', '[Agent types](https://python.langchain.com/en/latest/modules/agents/agents/agent_types.html)\n', '\n', "By setting verbose=True, we can see the agent's internal state as it processes the input.\n", '\n', 'Other agent types could also be chosen\n', '\n', 'Now, let us ask our agent a question for which it will use the serpapi tool to search the internet for an answer and need to calculate the answer using the llm-math tool.'] 30 | ['agent.run("What is the hight of Obama? And how many cans of coke can you stack to reach that height?")'] 31 | ['# Memory: Add state to chains and agents\n', '\n', 'By adding memory to our agents, we can make them more dynamic and interactive. We can have a conversation and the agent will remember what we said and use that information to answer our questions.\n'] 32 | ['from langchain import OpenAI, ConversationChain'] 33 | ['llm = OpenAI(temperature=0)\n', 'conversation = ConversationChain(llm=llm, verbose=True)'] 34 | ['conversation.predict(input="Hi how are you doing!")'] 35 | ['conversation.predict(input="I\'m doing well! Just having a conversation with my newly created langchain agent with memory.")'] 36 | ['conversation.predict(input="I would like to get to know a bit about what LangChain is")'] 37 | ['# Continue the conversation yourself!'] 38 | ['conversation.predict(input="...")'] 39 | ['# This is the end of the tutorial. 
"""Minimal NewsAPI walkthrough: top headlines, full-text search and source listing."""
import os

from newsapi import NewsApiClient

# Init
# The key is read from the environment so it never ends up in version control.
# NOTE(review): the key that used to be hard-coded here has been published
# with the repository and should be revoked.
api_key = os.environ.get('NEWS_API_KEY')
if not api_key:
    raise RuntimeError('Set the NEWS_API_KEY environment variable to your NewsAPI key')
newsapi = NewsApiClient(api_key=api_key)

# /v2/top-headlines
# The endpoint does not allow `sources` together with `country`/`category`
# (the client raises ValueError), so this query filters by country/category.
# To filter by outlet instead, pass sources='bbc-news,the-verge' and drop
# both `category` and `country`.
top_headlines = newsapi.get_top_headlines(q='bitcoin',
                                          category='business',
                                          language='en',
                                          country='us')

# /v2/everything
all_articles = newsapi.get_everything(q='bitcoin',
                                      sources='bbc-news,the-verge',
                                      domains='bbc.co.uk,techcrunch.com',
                                      from_param='2017-12-01',
                                      to='2017-12-12',
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)

# /v2/top-headlines/sources
sources = newsapi.get_sources()
"display_name": "Python 3 (ipykernel)" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "source": [ 21 | "# Chat with your csv files using Langchain and OpenAI\n", 22 | "\n", 23 | "In this notebook we will use Langchain and OpenAI to create a question-answering system for a csv file. We will use the tracklist.csv file from my spotify repository - [Github](https://github.com/rubentak/Spotify)\n", 24 | "\n", 25 | "\n", 26 | "\n", 27 | "### First install langchain and openai if these are not installed" 28 | ], 29 | "metadata": { 30 | "id": "r0alwu2qZzGU" 31 | } 32 | }, 33 | { 34 | "cell_type": "code", 35 | "source": [ 36 | "# !pip install -q langchain openai os\n" 37 | ], 38 | "metadata": { 39 | "colab": { 40 | "base_uri": "https://localhost:8080/" 41 | }, 42 | "id": "JGXR0-MsaE1T", 43 | "outputId": "5d097565-7a4b-4f2b-a3f3-09a733a43f6f", 44 | "ExecuteTime": { 45 | "end_time": "2023-05-04T08:13:27.996158Z", 46 | "start_time": "2023-05-04T08:13:27.989429Z" 47 | } 48 | }, 49 | "execution_count": 13, 50 | "outputs": [] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "source": [ 55 | "### Load the libraries" 56 | ], 57 | "metadata": { 58 | "collapsed": false 59 | } 60 | }, 61 | { 62 | "cell_type": "code", 63 | "source": [ 64 | "from langchain.document_loaders import CSVLoader\n", 65 | "from langchain.indexes import VectorstoreIndexCreator\n", 66 | "from langchain.chains import RetrievalQA\n", 67 | "from langchain.llms import OpenAI\n", 68 | "import os" 69 | ], 70 | "metadata": { 71 | "id": "0X1EX8eRcRCk", 72 | "ExecuteTime": { 73 | "start_time": "2023-05-23T18:54:01.742509Z", 74 | "end_time": "2023-05-23T18:54:01.745852Z" 75 | } 76 | }, 77 | "execution_count": 2, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "source": [ 83 | "Get your OpenAI Key from here - https://platform.openai.com/account/api-keys" 84 | ], 85 | "metadata": { 86 | "id": "aJOrUeJedzMF" 87 | } 88 | }, 
89 | { 90 | "cell_type": "markdown", 91 | "source": [ 92 | "### Set enviorment variable and download the csv file" 93 | ], 94 | "metadata": { 95 | "collapsed": false 96 | } 97 | }, 98 | { 99 | "cell_type": "code", 100 | "source": [ 101 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" 102 | ], 103 | "metadata": { 104 | "id": "Zmk3KQMqZvsO", 105 | "ExecuteTime": { 106 | "start_time": "2023-05-23T18:54:35.062762Z", 107 | "end_time": "2023-05-23T18:54:35.065989Z" 108 | } 109 | }, 110 | "execution_count": 5, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "source": [ 116 | "# Load the documents\n", 117 | "loader = CSVLoader(file_path='../data/tracklist.csv')" 118 | ], 119 | "metadata": { 120 | "id": "TR8dXxxHbY_b", 121 | "ExecuteTime": { 122 | "start_time": "2023-05-23T18:54:35.424704Z", 123 | "end_time": "2023-05-23T18:54:35.427875Z" 124 | } 125 | }, 126 | "execution_count": 6, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "source": [ 132 | "# Create an index using the loaded documents\n", 133 | "index_creator = VectorstoreIndexCreator()\n", 134 | "docsearch = index_creator.from_loaders([loader])" 135 | ], 136 | "metadata": { 137 | "colab": { 138 | "base_uri": "https://localhost:8080/" 139 | }, 140 | "id": "3P4s3IKXaar8", 141 | "outputId": "602eedc3-7adb-4871-c685-c77c43544058", 142 | "ExecuteTime": { 143 | "start_time": "2023-05-23T18:54:35.798332Z", 144 | "end_time": "2023-05-23T18:54:45.841283Z" 145 | } 146 | }, 147 | "execution_count": 7, 148 | "outputs": [ 149 | { 150 | "name": "stderr", 151 | "output_type": "stream", 152 | "text": [ 153 | "Using embedded DuckDB without persistence: data will be transient\n" 154 | ] 155 | } 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "metadata": { 162 | "id": "-vruq68YZnmL", 163 | "ExecuteTime": { 164 | "start_time": "2023-05-23T18:54:47.816082Z", 165 | "end_time": "2023-05-23T18:54:47.822430Z" 166 | } 167 | }, 168 | "outputs": [], 169 | 
"source": [ 170 | "# Create a question-answering chain using the index\n", 171 | "chain = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type=\"stuff\", retriever=docsearch.vectorstore.as_retriever(), input_key=\"question\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "source": [ 177 | "# Pass a query to the chain\n", 178 | "query = \"Do you have a column called tempo?\"\n", 179 | "response = chain({\"question\": query})" 180 | ], 181 | "metadata": { 182 | "id": "Jwy7gjr0aXBr", 183 | "ExecuteTime": { 184 | "start_time": "2023-05-23T18:54:48.256986Z", 185 | "end_time": "2023-05-23T18:54:50.764984Z" 186 | } 187 | }, 188 | "execution_count": 9, 189 | "outputs": [] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "source": [ 194 | "print(response['result'])" 195 | ], 196 | "metadata": { 197 | "colab": { 198 | "base_uri": "https://localhost:8080/" 199 | }, 200 | "id": "9D8ajM74eC_c", 201 | "outputId": "22175dff-7b62-495e-8d98-9928095e5778", 202 | "ExecuteTime": { 203 | "start_time": "2023-05-23T18:54:50.767922Z", 204 | "end_time": "2023-05-23T18:54:50.772111Z" 205 | } 206 | }, 207 | "execution_count": 10, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | " Yes, tempo is one of the columns.\n" 214 | ] 215 | } 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 15, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | " added_at, id, name, popularity, uri, artist, album, release_date, duration_ms, length, danceability, acousticness, energy, instrumentalness, liveness, loudness, speechiness, tempo, time_signature, valence, mode, key, genres, and genre_group.\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "# wrap it in a function\n", 232 | "query = \"What are all the columns in this file?\"\n", 233 | "def ask_question(query):\n", 234 | " response = chain({\"question\": query})\n", 235 | " return response['result']\n", 236 | 
"print(ask_question(query))" 237 | ], 238 | "metadata": { 239 | "collapsed": false, 240 | "ExecuteTime": { 241 | "start_time": "2023-05-23T18:57:26.085807Z", 242 | "end_time": "2023-05-23T18:57:34.823590Z" 243 | } 244 | } 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "source": [ 249 | "## Continue the conversation yourself!" 250 | ], 251 | "metadata": { 252 | "collapsed": false 253 | } 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "outputs": [], 259 | "source": [ 260 | "query = \"...\"\n", 261 | "def ask_question(query):\n", 262 | " response = chain({\"question\": query})\n", 263 | " return response['result']\n", 264 | "ask_question(query)" 265 | ], 266 | "metadata": { 267 | "collapsed": false, 268 | "ExecuteTime": { 269 | "start_time": "2023-05-23T18:55:23.815162Z", 270 | "end_time": "2023-05-23T18:55:26.098961Z" 271 | } 272 | } 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "outputs": [], 278 | "source": [], 279 | "metadata": { 280 | "collapsed": false 281 | } 282 | } 283 | ] 284 | } 285 | -------------------------------------------------------------------------------- /notebooks/Langchain_doc_chroma.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Scrape the LangChain documentation into a ChromaDB Vector Database and use it for a GPT-4 chatbot to talk with it!\n", 7 | "\n", 8 | "In this notebook, I will introduce you to vector databases. 
I will:\n", 9 | "- Web scrape the LangChain documentation\n", 10 | "- Store the LangChain documentation in a Chroma DB vector database\n", 11 | "- Create a retriever to retrieve the desired information\n", 12 | "- Create a Q&A chatbot with GPT-4\n", 13 | "- Show how you can delete and reopen a vector database locally to save space\n", 14 | "Visualise your vector database (very cool, read till the end!)\n", 15 | "\n", 16 | "This notebook is connected to a medium article: [Medium articles](https://medium.com/@rubentak)" 17 | ], 18 | "metadata": { 19 | "collapsed": false 20 | } 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": { 26 | "collapsed": true, 27 | "ExecuteTime": { 28 | "end_time": "2023-06-19T16:41:30.999311Z", 29 | "start_time": "2023-06-19T16:41:30.995203Z" 30 | } 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# Import libraries\n", 35 | "from bs4 import BeautifulSoup\n", 36 | "import requests\n", 37 | "import re" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": "'\\n\\n\\n\\n\\n🦜️🔗 Langchain\\n\\n\\n\\n\\n
\\n
# Function for getting the text data from a website url
def get_data(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a host that
            never answers.

    Returns:
        The decoded response body (``Response.text``). The HTTP status is not
        checked, so error pages are returned like any other page.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    r = requests.get(url, timeout=timeout)
    return r.text


# get links of website
def get_links(website_link):
    """Return the ``href`` of every ``<a>`` tag found on *website_link*.

    Args:
        website_link: URL of the page to scan.

    Returns:
        A list of the raw ``href`` attribute values, in document order.
        Anchors without an ``href`` attribute are skipped.
    """
    html_data = get_data(website_link)
    soup = BeautifulSoup(html_data, "html.parser")
    return [anchor["href"] for anchor in soup.find_all("a", href=True)]
"execution_count": 7, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "[]\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "# Add base path to all links\n", 141 | "def add_base_path(website_link, list_links):\n", 142 | " list_links_with_base_path = []\n", 143 | "\n", 144 | " for link in list_links:\n", 145 | "\n", 146 | " if not link.startswith('/'):\n", 147 | " link_with_base_path = website_link + link\n", 148 | " list_links_with_base_path.append(link_with_base_path)\n", 149 | "\n", 150 | "\t\t# if link.startswith('https://') dont add base path\n", 151 | " elif link.startswith('http://'):\n", 152 | " list_links_with_base_path.append(link)\n", 153 | "\n", 154 | " elif link.startswith('.'):\n", 155 | " link_with_base_path = website_link + link.split('/')[-1]\n", 156 | " list_links_with_base_path.append(link_with_base_path)\n", 157 | "\n", 158 | " return list_links_with_base_path\n", 159 | "\n", 160 | "link_list = add_base_path('https://python.langchain.com/en/latest/', sub_links)\n", 161 | "link_list_print = print(link_list)" 162 | ], 163 | "metadata": { 164 | "collapsed": false, 165 | "ExecuteTime": { 166 | "end_time": "2023-06-19T16:41:33.900107Z", 167 | "start_time": "2023-06-19T16:41:33.885118Z" 168 | } 169 | } 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 207, 174 | "outputs": [], 175 | "source": [ 176 | "def save_content(link_list):\n", 177 | " for i, link in enumerate(link_list):\n", 178 | " html_data = get_data(link)\n", 179 | " soup = BeautifulSoup(html_data, \"html.parser\")\n", 180 | " text = soup.get_text()\n", 181 | "\n", 182 | " # Remove the first 835 lines\n", 183 | " lines = text.splitlines()\n", 184 | " cleaned_text = \"\\n\".join(lines[835:])\n", 185 | "\n", 186 | " # Get the first 3 words in the cleaned text\n", 187 | " words = cleaned_text.split()[:3]\n", 188 | " file_name_prefix = \"_\".join(words)\n", 189 | "\n", 190 | " # Replace special characters and spaces 
with an underscore\n", 191 | " file_name_prefix = re.sub(r\"[^a-zA-Z0-9]+\", \"_\", file_name_prefix)\n", 192 | "\n", 193 | " # Get the current working directory\n", 194 | " current_dir = os.getcwd()\n", 195 | "\n", 196 | " # Move up one level to the parent directory\n", 197 | " parent_dir = os.path.dirname(current_dir)\n", 198 | "\n", 199 | " # Set the path to the data folder\n", 200 | " data_folder = os.path.join(parent_dir, \"data/langchain_doc\")\n", 201 | "\n", 202 | " # Create the data folder if it doesn't exist\n", 203 | " if not os.path.exists(data_folder):\n", 204 | " os.makedirs(data_folder)\n", 205 | "\n", 206 | " # Set the path to the output file\n", 207 | " output_file = os.path.join(data_folder, f\"{i}_{file_name_prefix}.txt\")\n", 208 | "\n", 209 | " # Save the cleaned content to the output file\n", 210 | " with open(output_file, \"w\") as f:\n", 211 | " f.write(cleaned_text)" 212 | ], 213 | "metadata": { 214 | "collapsed": false, 215 | "ExecuteTime": { 216 | "end_time": "2023-06-13T16:19:58.593491Z", 217 | "start_time": "2023-06-13T16:19:58.585704Z" 218 | } 219 | } 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 208, 224 | "outputs": [], 225 | "source": [ 226 | "# save the content of the links into txt files\n", 227 | "save_content(link_list)" 228 | ], 229 | "metadata": { 230 | "collapsed": false, 231 | "ExecuteTime": { 232 | "end_time": "2023-06-13T16:27:42.481314Z", 233 | "start_time": "2023-06-13T16:19:59.394583Z" 234 | } 235 | } 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "source": [ 240 | "# Q&A bot with langchain over a directory" 241 | ], 242 | "metadata": { 243 | "collapsed": false 244 | } 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 8, 249 | "outputs": [], 250 | "source": [ 251 | "# Import libraries\n", 252 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 253 | "from langchain.chat_models import ChatOpenAI\n", 254 | "import os\n", 255 | "from 
langchain.vectorstores import Chroma\n", 256 | "from langchain.embeddings import OpenAIEmbeddings\n", 257 | "from langchain.llms import OpenAI\n", 258 | "from langchain.chains import RetrievalQA\n", 259 | "from langchain.document_loaders import DirectoryLoader" 260 | ], 261 | "metadata": { 262 | "collapsed": false, 263 | "ExecuteTime": { 264 | "end_time": "2023-06-19T16:41:39.120259Z", 265 | "start_time": "2023-06-19T16:41:37.999101Z" 266 | } 267 | } 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 10, 272 | "outputs": [], 273 | "source": [ 274 | "# Create a new openai api key\n", 275 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", 276 | "# set up openai api key\n", 277 | "openai_api_key = os.environ.get('OPENAI_API_KEY')" 278 | ], 279 | "metadata": { 280 | "collapsed": false, 281 | "ExecuteTime": { 282 | "end_time": "2023-06-19T16:41:51.728890Z", 283 | "start_time": "2023-06-19T16:41:51.721987Z" 284 | } 285 | } 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 11, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": "679" 294 | }, 295 | "execution_count": 11, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "# Print number of txt files in directory\n", 302 | "loader = DirectoryLoader('/Users/erictak/PycharmProjects/langchain/data/langchain_doc', glob=\"./*.txt\")\n", 303 | "doc = loader.load ( )\n", 304 | "len(doc)" 305 | ], 306 | "metadata": { 307 | "collapsed": false, 308 | "ExecuteTime": { 309 | "end_time": "2023-06-19T16:42:12.868777Z", 310 | "start_time": "2023-06-19T16:41:56.716136Z" 311 | } 312 | } 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 12, 317 | "outputs": [], 318 | "source": [ 319 | "# Splitting the text into chunks\n", 320 | "text_splitter = RecursiveCharacterTextSplitter (chunk_size=1000, chunk_overlap=200)\n", 321 | "texts = text_splitter.split_documents(doc)" 322 | ], 323 | "metadata": { 324 | "collapsed": false, 325 | 
"ExecuteTime": { 326 | "end_time": "2023-06-19T16:42:15.006310Z", 327 | "start_time": "2023-06-19T16:42:14.844815Z" 328 | } 329 | } 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 13, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": "5576" 338 | }, 339 | "execution_count": 13, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "# Count the number of chunks\n", 346 | "len(texts)" 347 | ], 348 | "metadata": { 349 | "collapsed": false, 350 | "ExecuteTime": { 351 | "end_time": "2023-06-19T16:42:15.204405Z", 352 | "start_time": "2023-06-19T16:42:15.197499Z" 353 | } 354 | } 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 14, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": "Document(page_content='Twitter\\n\\nContents\\n\\nInstallation and Setup\\n\\nDocument Loader\\n\\nTwitter#\\n\\nTwitter is an online social media and social networking service.\\n\\nInstallation and Setup#\\n\\npip install tweepy\\n\\nWe must initialize the loader with the Twitter API token, and we need to set up the Twitter username.\\n\\nDocument Loader#\\n\\nSee a usage example.\\n\\nfrom langchain.document_loaders import TwitterTweetLoader\\n\\nprevious\\n\\nTrello\\n\\nnext\\n\\nUnstructured\\n\\nContents\\n\\nInstallation and Setup\\n\\nDocument Loader\\n\\nBy Harrison Chase\\n\\n© Copyright 2023, Harrison Chase.\\n\\nLast updated on Jun 13, 2023.', metadata={'source': '/Users/erictak/PycharmProjects/langchain/data/langchain_doc/592_Twitter_Contents_Installation.txt'})" 363 | }, 364 | "execution_count": 14, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "# Print the first chunk\n", 371 | "texts[0]" 372 | ], 373 | "metadata": { 374 | "collapsed": false, 375 | "ExecuteTime": { 376 | "end_time": "2023-06-19T16:42:16.482727Z", 377 | "start_time": "2023-06-19T16:42:16.460830Z" 378 | } 379 | } 380 | }, 381 | { 382 | "cell_type": 
"markdown", 383 | "source": [ 384 | "# Data base creation with ChromaDB\n", 385 | "\n", 386 | "https://www.youtube.com/watch?v=3yPBVii7Ct0" 387 | ], 388 | "metadata": { 389 | "collapsed": false 390 | } 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 15, 395 | "outputs": [ 396 | { 397 | "name": "stderr", 398 | "output_type": "stream", 399 | "text": [ 400 | "Using embedded DuckDB with persistence: data will be stored in: db\n" 401 | ] 402 | } 403 | ], 404 | "source": [ 405 | "# Embed and store the texts\n", 406 | "# Supplying a persist_directory will store the embeddings on disk\n", 407 | "persist_directory = 'db'\n", 408 | "\n", 409 | "# OpenAI embeddings\n", 410 | "embedding = OpenAIEmbeddings()\n", 411 | "\n", 412 | "vectordb = Chroma.from_documents(documents=texts,\n", 413 | " embedding=embedding,\n", 414 | " persist_directory=persist_directory)" 415 | ], 416 | "metadata": { 417 | "collapsed": false, 418 | "ExecuteTime": { 419 | "end_time": "2023-06-19T16:43:22.594752Z", 420 | "start_time": "2023-06-19T16:42:18.312526Z" 421 | } 422 | } 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 16, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": "FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))", 431 | "application/vnd.jupyter.widget-view+json": { 432 | "version_major": 2, 433 | "version_minor": 0, 434 | "model_id": "125530d1ac8c4ce3b0eac61cf501ec54" 435 | } 436 | }, 437 | "metadata": {}, 438 | "output_type": "display_data" 439 | } 440 | ], 441 | "source": [ 442 | "# Persist the db to disk\n", 443 | "vectordb.persist()\n", 444 | "vectordb = None" 445 | ], 446 | "metadata": { 447 | "collapsed": false, 448 | "ExecuteTime": { 449 | "end_time": "2023-06-19T16:43:27.867043Z", 450 | "start_time": "2023-06-19T16:43:22.595566Z" 451 | } 452 | } 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 17, 457 | "outputs": [ 458 | { 459 | "name": "stderr", 460 | 
"output_type": "stream", 461 | "text": [ 462 | "Using embedded DuckDB with persistence: data will be stored in: db\n" 463 | ] 464 | } 465 | ], 466 | "source": [ 467 | "# Now we can load the persisted database from disk, and use it as normal.\n", 468 | "vectordb = Chroma(persist_directory=persist_directory,\n", 469 | " embedding_function=embedding)" 470 | ], 471 | "metadata": { 472 | "collapsed": false, 473 | "ExecuteTime": { 474 | "end_time": "2023-06-19T16:43:31.243716Z", 475 | "start_time": "2023-06-19T16:43:29.911945Z" 476 | } 477 | } 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "source": [ 482 | "# Create retriever" 483 | ], 484 | "metadata": { 485 | "collapsed": false 486 | } 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 41, 491 | "outputs": [], 492 | "source": [ 493 | "retriever = vectordb.as_retriever()" 494 | ], 495 | "metadata": { 496 | "collapsed": false, 497 | "ExecuteTime": { 498 | "end_time": "2023-06-19T16:26:30.027619Z", 499 | "start_time": "2023-06-19T16:26:30.025333Z" 500 | } 501 | } 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 42, 506 | "outputs": [], 507 | "source": [ 508 | "docs = retriever.get_relevant_documents(\"What to do when getting started?\")" 509 | ], 510 | "metadata": { 511 | "collapsed": false, 512 | "ExecuteTime": { 513 | "end_time": "2023-06-19T16:26:30.878277Z", 514 | "start_time": "2023-06-19T16:26:30.666287Z" 515 | } 516 | } 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 43, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": "[Document(page_content='Step 1: Create Tools# Agents are largely defined by the tools they can use. If you have a specific task you want the agent to accomplish, you have to give it access to the right tools. We have many tools natively in LangChain, so you should first look to see if any of them meet your needs. 
But we also make it easy to define a custom tool, so if you need custom tools you should absolutely do that.\\n\\n(Optional) Step 2: Modify Agent# The built-in LangChain agent types are designed to work well in generic situations, but you may be able to improve performance by modifying the agent implementation. There are several ways you could do this:\\n\\nModify the base prompt. This can be used to give the agent more context on how it should behave, etc. Modify the output parser. This is necessary if the agent is having trouble parsing the language model output.', metadata={'source': '/Users/erictak/PycharmProjects/langchain/data/langchain_doc/644_Agents_Contents_Create.txt'}),\n Document(page_content='Step 1: Create Tools# Agents are largely defined by the tools they can use. If you have a specific task you want the agent to accomplish, you have to give it access to the right tools. We have many tools natively in LangChain, so you should first look to see if any of them meet your needs. But we also make it easy to define a custom tool, so if you need custom tools you should absolutely do that.\\n\\n(Optional) Step 2: Modify Agent# The built-in LangChain agent types are designed to work well in generic situations, but you may be able to improve performance by modifying the agent implementation. There are several ways you could do this:\\n\\nModify the base prompt. This can be used to give the agent more context on how it should behave, etc. Modify the output parser. This is necessary if the agent is having trouble parsing the language model output.', metadata={'source': '/Users/erictak/PycharmProjects/langchain/data/langchain_doc/644_Agents_Contents_Create.txt'}),\n Document(page_content='Step 1: Create Tools# Agents are largely defined by the tools they can use. If you have a specific task you want the agent to accomplish, you have to give it access to the right tools. 
We have many tools natively in LangChain, so you should first look to see if any of them meet your needs. But we also make it easy to define a custom tool, so if you need custom tools you should absolutely do that.\\n\\n(Optional) Step 2: Modify Agent# The built-in LangChain agent types are designed to work well in generic situations, but you may be able to improve performance by modifying the agent implementation. There are several ways you could do this:\\n\\nModify the base prompt. This can be used to give the agent more context on how it should behave, etc. Modify the output parser. This is necessary if the agent is having trouble parsing the language model output.', metadata={'source': '/Users/erictak/PycharmProjects/langchain/data/langchain_doc/644_Agents_Contents_Create.txt'}),\n Document(page_content='Step 1: Create Tools# Agents are largely defined by the tools they can use. If you have a specific task you want the agent to accomplish, you have to give it access to the right tools. We have many tools natively in LangChain, so you should first look to see if any of them meet your needs. But we also make it easy to define a custom tool, so if you need custom tools you should absolutely do that.\\n\\n(Optional) Step 2: Modify Agent# The built-in LangChain agent types are designed to work well in generic situations, but you may be able to improve performance by modifying the agent implementation. There are several ways you could do this:\\n\\nModify the base prompt. This can be used to give the agent more context on how it should behave, etc. Modify the output parser. 
This is necessary if the agent is having trouble parsing the language model output.', metadata={'source': '/Users/erictak/PycharmProjects/langchain/data/langchain_doc/426_Agents_Contents_Create.txt'})]" 525 | }, 526 | "execution_count": 43, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "docs" 533 | ], 534 | "metadata": { 535 | "collapsed": false, 536 | "ExecuteTime": { 537 | "end_time": "2023-06-19T16:26:31.158685Z", 538 | "start_time": "2023-06-19T16:26:31.152005Z" 539 | } 540 | } 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 44, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": "4" 549 | }, 550 | "execution_count": 44, 551 | "metadata": {}, 552 | "output_type": "execute_result" 553 | } 554 | ], 555 | "source": [ 556 | "len(docs)" 557 | ], 558 | "metadata": { 559 | "collapsed": false, 560 | "ExecuteTime": { 561 | "end_time": "2023-06-19T16:26:31.757606Z", 562 | "start_time": "2023-06-19T16:26:31.753080Z" 563 | } 564 | } 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 45, 569 | "outputs": [], 570 | "source": [ 571 | "retriever = vectordb.as_retriever(search_kwargs={\"k\": 2})" 572 | ], 573 | "metadata": { 574 | "collapsed": false, 575 | "ExecuteTime": { 576 | "end_time": "2023-06-19T16:26:32.357106Z", 577 | "start_time": "2023-06-19T16:26:32.352369Z" 578 | } 579 | } 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 46, 584 | "outputs": [ 585 | { 586 | "data": { 587 | "text/plain": "'similarity'" 588 | }, 589 | "execution_count": 46, 590 | "metadata": {}, 591 | "output_type": "execute_result" 592 | } 593 | ], 594 | "source": [ 595 | "retriever.search_type" 596 | ], 597 | "metadata": { 598 | "collapsed": false, 599 | "ExecuteTime": { 600 | "end_time": "2023-06-19T16:26:32.812894Z", 601 | "start_time": "2023-06-19T16:26:32.810205Z" 602 | } 603 | } 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 47, 608 | "outputs": [ 
609 | { 610 | "data": { 611 | "text/plain": "{'k': 2}" 612 | }, 613 | "execution_count": 47, 614 | "metadata": {}, 615 | "output_type": "execute_result" 616 | } 617 | ], 618 | "source": [ 619 | "retriever.search_kwargs" 620 | ], 621 | "metadata": { 622 | "collapsed": false, 623 | "ExecuteTime": { 624 | "end_time": "2023-06-19T16:26:34.043554Z", 625 | "start_time": "2023-06-19T16:26:34.016639Z" 626 | } 627 | } 628 | }, 629 | { 630 | "cell_type": "markdown", 631 | "source": [ 632 | "# Create a question answering chain" 633 | ], 634 | "metadata": { 635 | "collapsed": false 636 | } 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 49, 641 | "outputs": [], 642 | "source": [ 643 | "# Create the chain to answer questions\n", 644 | "qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(),\n", 645 | " chain_type=\"stuff\",\n", 646 | " retriever=retriever,\n", 647 | " return_source_documents=True,\n", 648 | " verbose=True)" 649 | ], 650 | "metadata": { 651 | "collapsed": false, 652 | "ExecuteTime": { 653 | "end_time": "2023-06-19T16:27:30.194519Z", 654 | "start_time": "2023-06-19T16:27:30.192880Z" 655 | } 656 | } 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": 50, 661 | "outputs": [], 662 | "source": [ 663 | "# Cite sources\n", 664 | "def process_llm_response(llm_response):\n", 665 | " print(llm_response['result'])\n", 666 | " print('\\n\\nSources:')\n", 667 | " for source in llm_response[\"source_documents\"]:\n", 668 | " print(source.metadata['source'])" 669 | ], 670 | "metadata": { 671 | "collapsed": false, 672 | "ExecuteTime": { 673 | "end_time": "2023-06-19T16:27:30.714970Z", 674 | "start_time": "2023-06-19T16:27:30.710251Z" 675 | } 676 | } 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 51, 681 | "outputs": [ 682 | { 683 | "name": "stdout", 684 | "output_type": "stream", 685 | "text": [ 686 | "\n", 687 | "\n", 688 | "\u001B[1m> Entering new RetrievalQA chain...\u001B[0m\n", 689 | "\n", 690 | "\u001B[1m> Finished 
chain.\u001B[0m\n", 691 | " Step 1: Create Tools (Optional), Step 2: Modify Agent (Optional), Step 3: Modify Agent Executor.\n", 692 | "\n", 693 | "\n", 694 | "Sources:\n", 695 | "/Users/erictak/PycharmProjects/langchain/data/langchain_doc/426_Agents_Contents_Create.txt\n", 696 | "/Users/erictak/PycharmProjects/langchain/data/langchain_doc/644_Agents_Contents_Create.txt\n" 697 | ] 698 | } 699 | ], 700 | "source": [ 701 | "# Question\n", 702 | "query = \"What are the steps of the Quickstart Guide?\"\n", 703 | "llm_response = qa_chain(query)\n", 704 | "process_llm_response(llm_response)" 705 | ], 706 | "metadata": { 707 | "collapsed": false, 708 | "ExecuteTime": { 709 | "end_time": "2023-06-19T16:27:34.967867Z", 710 | "start_time": "2023-06-19T16:27:31.647843Z" 711 | } 712 | } 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 53, 717 | "outputs": [ 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "\n", 723 | "\n", 724 | "\u001B[1m> Entering new RetrievalQA chain...\u001B[0m\n", 725 | "\n", 726 | "\u001B[1m> Finished chain.\u001B[0m\n", 727 | " Custom Agent, Custom LLM Agent, Custom LLM Agent (with a ChatModel), Custom MRKL Agent, Custom MultiAction Agent, Custom Agent with Tool Retrieval, Conversation Agent (for Chat Models), Conversation Agent MRKL, MRKL Chat, ReAct, Self Ask With Search, Structured Tool Chat Agent.\n", 728 | "\n", 729 | "\n", 730 | "Sources:\n", 731 | "/Users/erictak/PycharmProjects/langchain/data/langchain_doc/382_Agents_Agents_Note.txt\n", 732 | "/Users/erictak/PycharmProjects/langchain/data/langchain_doc/382_Agents_Agents_Note.txt\n" 733 | ] 734 | } 735 | ], 736 | "source": [ 737 | "# Break it down\n", 738 | "query = \"What are all agent types?\"\n", 739 | "llm_response = qa_chain(query)\n", 740 | "process_llm_response(llm_response)\n", 741 | "#llm_response" 742 | ], 743 | "metadata": { 744 | "collapsed": false, 745 | "ExecuteTime": { 746 | "end_time": "2023-06-19T16:28:07.848794Z", 747 | 
"start_time": "2023-06-19T16:28:02.784404Z" 748 | } 749 | } 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": 54, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/plain": "('similarity', )" 758 | }, 759 | "execution_count": 54, 760 | "metadata": {}, 761 | "output_type": "execute_result" 762 | } 763 | ], 764 | "source": [ 765 | "qa_chain.retriever.search_type , qa_chain.retriever.vectorstore" 766 | ], 767 | "metadata": { 768 | "collapsed": false, 769 | "ExecuteTime": { 770 | "end_time": "2023-06-19T16:28:14.817109Z", 771 | "start_time": "2023-06-19T16:28:14.812659Z" 772 | } 773 | } 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": 55, 778 | "outputs": [ 779 | { 780 | "name": "stdout", 781 | "output_type": "stream", 782 | "text": [ 783 | "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n", 784 | "\n", 785 | "{context}\n", 786 | "\n", 787 | "Question: {question}\n", 788 | "Helpful Answer:\n" 789 | ] 790 | } 791 | ], 792 | "source": [ 793 | "print(qa_chain.combine_documents_chain.llm_chain.prompt.template)" 794 | ], 795 | "metadata": { 796 | "collapsed": false, 797 | "ExecuteTime": { 798 | "end_time": "2023-06-19T16:28:15.148649Z", 799 | "start_time": "2023-06-19T16:28:15.140236Z" 800 | } 801 | } 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "source": [ 806 | "# Deleteing the DB" 807 | ], 808 | "metadata": { 809 | "collapsed": false 810 | } 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 250, 815 | "outputs": [ 816 | { 817 | "name": "stdout", 818 | "output_type": "stream", 819 | "text": [ 820 | "updating: db/ (stored 0%)\r\n", 821 | "updating: db/chroma-embeddings.parquet (deflated 29%)\r\n", 822 | "updating: db/index/ (stored 0%)\r\n", 823 | "updating: db/index/index_metadata_b9a5e02f-ebd0-4b13-8858-b30b211c4546.pkl (deflated 5%)\r\n", 824 | "updating: 
db/index/id_to_uuid_b9a5e02f-ebd0-4b13-8858-b30b211c4546.pkl (deflated 37%)\r\n", 825 | "updating: db/index/uuid_to_id_d80886e4-65e1-4231-8c73-99ff58d68061.pkl (deflated 39%)\r\n", 826 | "updating: db/index/index_b9a5e02f-ebd0-4b13-8858-b30b211c4546.bin (deflated 17%)\r\n", 827 | "updating: db/index/index_d80886e4-65e1-4231-8c73-99ff58d68061.bin (deflated 17%)\r\n", 828 | "updating: db/index/uuid_to_id_b9a5e02f-ebd0-4b13-8858-b30b211c4546.pkl (deflated 41%)\r\n", 829 | "updating: db/index/id_to_uuid_d80886e4-65e1-4231-8c73-99ff58d68061.pkl (deflated 32%)\r\n", 830 | "updating: db/index/index_metadata_d80886e4-65e1-4231-8c73-99ff58d68061.pkl (deflated 5%)\r\n", 831 | "updating: db/chroma-collections.parquet (deflated 50%)\r\n", 832 | "updating: db/.DS_Store (deflated 96%)\r\n" 833 | ] 834 | } 835 | ], 836 | "source": [ 837 | "!zip -r db.zip ./db" 838 | ], 839 | "metadata": { 840 | "collapsed": false, 841 | "ExecuteTime": { 842 | "end_time": "2023-06-13T17:27:10.552238Z", 843 | "start_time": "2023-06-13T17:27:05.048974Z" 844 | } 845 | } 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 251, 850 | "outputs": [], 851 | "source": [ 852 | "# To clean up, you can delete the collection\n", 853 | "vectordb.delete_collection()\n", 854 | "vectordb.persist()\n", 855 | "\n", 856 | "# Delete the directory\n", 857 | "!rm -rf db/" 858 | ], 859 | "metadata": { 860 | "collapsed": false, 861 | "ExecuteTime": { 862 | "end_time": "2023-06-13T17:27:10.802965Z", 863 | "start_time": "2023-06-13T17:27:10.552089Z" 864 | } 865 | } 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "source": [ 870 | "# Starting again loading the db" 871 | ], 872 | "metadata": { 873 | "collapsed": false 874 | } 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": 57, 879 | "outputs": [ 880 | { 881 | "name": "stdout", 882 | "output_type": "stream", 883 | "text": [ 884 | "Archive: db.zip\r\n", 885 | "replace db/chroma-embeddings.parquet? 
[y]es, [n]o, [A]ll, [N]one, [r]ename: ^C\r\n" 886 | ] 887 | } 888 | ], 889 | "source": [ 890 | "!unzip db.zip" 891 | ], 892 | "metadata": { 893 | "collapsed": false, 894 | "ExecuteTime": { 895 | "end_time": "2023-06-19T16:30:09.309839Z", 896 | "start_time": "2023-06-19T16:29:00.698188Z" 897 | } 898 | } 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": null, 903 | "outputs": [], 904 | "source": [ 905 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" 906 | ], 907 | "metadata": { 908 | "collapsed": false 909 | } 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": 59, 914 | "outputs": [ 915 | { 916 | "name": "stderr", 917 | "output_type": "stream", 918 | "text": [ 919 | "Using embedded DuckDB with persistence: data will be stored in: db\n" 920 | ] 921 | } 922 | ], 923 | "source": [ 924 | "persist_directory = 'db'\n", 925 | "embedding = OpenAIEmbeddings()\n", 926 | "\n", 927 | "vectordb2 = Chroma(persist_directory=persist_directory,\n", 928 | " embedding_function=embedding,\n", 929 | " )\n", 930 | "\n", 931 | "retriever = vectordb2.as_retriever(search_kwargs={\"k\": 2})" 932 | ], 933 | "metadata": { 934 | "collapsed": false, 935 | "ExecuteTime": { 936 | "end_time": "2023-06-19T16:30:13.801920Z", 937 | "start_time": "2023-06-19T16:30:12.530863Z" 938 | } 939 | } 940 | }, 941 | { 942 | "cell_type": "markdown", 943 | "source": [ 944 | "#### Using the turbo GPT API" 945 | ], 946 | "metadata": { 947 | "collapsed": false 948 | } 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": 60, 953 | "outputs": [], 954 | "source": [ 955 | "# Set up the turbo LLM\n", 956 | "turbo_llm = ChatOpenAI(\n", 957 | " temperature=0,\n", 958 | " model_name='gpt-3.5-turbo'\n", 959 | ")" 960 | ], 961 | "metadata": { 962 | "collapsed": false, 963 | "ExecuteTime": { 964 | "end_time": "2023-06-19T16:30:15.131031Z", 965 | "start_time": "2023-06-19T16:30:15.126008Z" 966 | } 967 | } 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": 61, 972 | 
"outputs": [], 973 | "source": [ 974 | "# Create the chain to answer questions\n", 975 | "qa_chain = RetrievalQA.from_chain_type(llm=turbo_llm,\n", 976 | " chain_type=\"stuff\",\n", 977 | " retriever=retriever,\n", 978 | " return_source_documents=True,\n", 979 | " verbose=True)" 980 | ], 981 | "metadata": { 982 | "collapsed": false, 983 | "ExecuteTime": { 984 | "end_time": "2023-06-19T16:30:15.535025Z", 985 | "start_time": "2023-06-19T16:30:15.531668Z" 986 | } 987 | } 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": 62, 992 | "outputs": [], 993 | "source": [ 994 | "# Cite sources\n", 995 | "def process_llm_response(llm_response):\n", 996 | " print(llm_response['result'])\n", 997 | " print('\\n\\nSources:')\n", 998 | " for source in llm_response[\"source_documents\"]:\n", 999 | " print(source.metadata['source'])" 1000 | ], 1001 | "metadata": { 1002 | "collapsed": false, 1003 | "ExecuteTime": { 1004 | "end_time": "2023-06-19T16:30:16.283860Z", 1005 | "start_time": "2023-06-19T16:30:16.275813Z" 1006 | } 1007 | } 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "execution_count": 63, 1012 | "outputs": [ 1013 | { 1014 | "name": "stdout", 1015 | "output_type": "stream", 1016 | "text": [ 1017 | "\n", 1018 | "\n", 1019 | "\u001B[1m> Entering new RetrievalQA chain...\u001B[0m\n", 1020 | "\n", 1021 | "\u001B[1m> Finished chain.\u001B[0m\n", 1022 | "There are two main types of agents mentioned in the context: Action Agents and Plan-and-Execute Agents. 
Action Agents decide the actions to take and execute those actions one at a time, while Plan-and-Execute Agents first decide a plan of actions to take, and then execute those actions one at a time.\n", 1023 | "\n", 1024 | "\n", 1025 | "Sources:\n", 1026 | "/Users/erictak/PycharmProjects/langchain/data/langchain_doc/639_Agents_Contents_Action.txt\n", 1027 | "/Users/erictak/PycharmProjects/langchain/data/langchain_doc/344_Agents_Contents_Action.txt\n" 1028 | ] 1029 | } 1030 | ], 1031 | "source": [ 1032 | "# Question\n", 1033 | "query = \"What are the agent types?\"\n", 1034 | "llm_response = qa_chain(query)\n", 1035 | "process_llm_response(llm_response)" 1036 | ], 1037 | "metadata": { 1038 | "collapsed": false, 1039 | "ExecuteTime": { 1040 | "end_time": "2023-06-19T16:30:22.264885Z", 1041 | "start_time": "2023-06-19T16:30:16.942379Z" 1042 | } 1043 | } 1044 | }, 1045 | { 1046 | "cell_type": "code", 1047 | "execution_count": 64, 1048 | "outputs": [ 1049 | { 1050 | "name": "stdout", 1051 | "output_type": "stream", 1052 | "text": [ 1053 | "Use the following pieces of context to answer the users question. 
\n", 1054 | "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n", 1055 | "----------------\n", 1056 | "{context}\n" 1057 | ] 1058 | } 1059 | ], 1060 | "source": [ 1061 | "print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)" 1062 | ], 1063 | "metadata": { 1064 | "collapsed": false, 1065 | "ExecuteTime": { 1066 | "end_time": "2023-06-19T16:30:23.332015Z", 1067 | "start_time": "2023-06-19T16:30:23.325565Z" 1068 | } 1069 | } 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": 65, 1074 | "outputs": [ 1075 | { 1076 | "name": "stdout", 1077 | "output_type": "stream", 1078 | "text": [ 1079 | "{question}\n" 1080 | ] 1081 | } 1082 | ], 1083 | "source": [ 1084 | "print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)" 1085 | ], 1086 | "metadata": { 1087 | "collapsed": false, 1088 | "ExecuteTime": { 1089 | "end_time": "2023-06-19T16:30:24.189842Z", 1090 | "start_time": "2023-06-19T16:30:24.183626Z" 1091 | } 1092 | } 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": 65, 1097 | "outputs": [], 1098 | "source": [], 1099 | "metadata": { 1100 | "collapsed": false, 1101 | "ExecuteTime": { 1102 | "end_time": "2023-06-19T16:30:24.724602Z", 1103 | "start_time": "2023-06-19T16:30:24.717401Z" 1104 | } 1105 | } 1106 | }, 1107 | { 1108 | "cell_type": "markdown", 1109 | "source": [ 1110 | "# Visualizing the Vector db\n", 1111 | "https://github.com/mtybadger/chromaviz?ref=reactjsexample.com\n", 1112 | "\n", 1113 | "https://github.com/avantrio/chroma-viewer\n" 1114 | ], 1115 | "metadata": { 1116 | "collapsed": false 1117 | } 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "outputs": [], 1123 | "source": [ 1124 | "from chromaviz import visualize_collection\n", 1125 | "visualize_collection(vectordb._collection)" 1126 | ], 1127 | "metadata": { 1128 | "collapsed": false 1129 | } 1130 | }, 1131 | { 1132 | 
"cell_type": "code", 1133 | "execution_count": null, 1134 | "outputs": [], 1135 | "source": [], 1136 | "metadata": { 1137 | "collapsed": false 1138 | } 1139 | } 1140 | ], 1141 | "metadata": { 1142 | "kernelspec": { 1143 | "display_name": "Python 3", 1144 | "language": "python", 1145 | "name": "python3" 1146 | }, 1147 | "language_info": { 1148 | "codemirror_mode": { 1149 | "name": "ipython", 1150 | "version": 2 1151 | }, 1152 | "file_extension": ".py", 1153 | "mimetype": "text/x-python", 1154 | "name": "python", 1155 | "nbconvert_exporter": "python", 1156 | "pygments_lexer": "ipython2", 1157 | "version": "2.7.6" 1158 | } 1159 | }, 1160 | "nbformat": 4, 1161 | "nbformat_minor": 0 1162 | } 1163 | -------------------------------------------------------------------------------- /notebooks/chains.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# LangChain Chains\n", 7 | "\n", 8 | "in this notebook we will take a look at different langchain chains" 9 | ], 10 | "metadata": { 11 | "collapsed": false 12 | } 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "outputs": [], 18 | "source": [ 19 | "# install the packages\n", 20 | "!pip install -Uqqq pip --progress-bar off\n", 21 | "!pip install -qqq langchain==0.0.149 --progress-bar off\n", 22 | "!pip install -qqq openai==0.27.4 --progress-bar off\n", 23 | "!pip install -qqq tiktoken==0.3.3 --progress-bar off\n", 24 | "!pip install -qqq watermark==2.3.1 --progress-bar off\n", 25 | "!pip install -qqq chromadb==0.3.21 --progress-bar off" 26 | ], 27 | "metadata": { 28 | "collapsed": false, 29 | "ExecuteTime": { 30 | "start_time": "2023-05-15T12:18:50.880652Z", 31 | "end_time": "2023-05-15T12:19:17.112307Z" 32 | } 33 | } 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "outputs": [], 39 | "source": [ 40 | "# import the packages\n", 41 | "import os\n", 42 | "import 
textwrap\n", 43 | "from getpass import getpass\n", 44 | "import chromadb\n", 45 | "import langchain\n", 46 | "import openai\n", 47 | "from langchain.chains import LLMBashChain, LLMChain, RetrievalQA, SimpleSequentialChain\n", 48 | "from langchain.chains.summarize import load_summarize_chain\n", 49 | "from langchain.chat_models import ChatOpenAI\n", 50 | "from langchain.docstore.document import Document\n", 51 | "from langchain.embeddings.openai import OpenAIEmbeddings\n", 52 | "from langchain.llms import OpenAI\n", 53 | "from langchain.prompts import PromptTemplate\n", 54 | "from langchain.text_splitter import CharacterTextSplitter\n", 55 | "from langchain.vectorstores import Chroma\n", 56 | "import credentials" 57 | ], 58 | "metadata": { 59 | "collapsed": false, 60 | "ExecuteTime": { 61 | "start_time": "2023-05-15T12:19:17.111791Z", 62 | "end_time": "2023-05-15T12:19:21.110437Z" 63 | } 64 | } 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "source": [ 69 | "The next cell uses IPython magic commands to load the \"watermark\" extension and display version information for the Python environment, the machine, and the imported packages."
70 | ], 71 | "metadata": { 72 | "collapsed": false 73 | } 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "Python implementation: CPython\n", 84 | "Python version : 3.10.10\n", 85 | "IPython version : 8.12.0\n", 86 | "\n", 87 | "Compiler : Clang 14.0.6 \n", 88 | "OS : Darwin\n", 89 | "Release : 22.2.0\n", 90 | "Machine : x86_64\n", 91 | "Processor : i386\n", 92 | "CPU cores : 10\n", 93 | "Architecture: 64bit\n", 94 | "\n", 95 | "langchain: 0.0.149\n", 96 | "openai : 0.27.4\n", 97 | "sys : 3.10.10 (main, Mar 21 2023, 13:41:39) [Clang 14.0.6 ]\n", 98 | "chromadb : 0.3.21\n", 99 | "\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "%load_ext watermark\n", 105 | "%watermark --iversions -v -m" 106 | ], 107 | "metadata": { 108 | "collapsed": false, 109 | "ExecuteTime": { 110 | "start_time": "2023-05-15T12:19:25.807982Z", 111 | "end_time": "2023-05-15T12:19:25.846670Z" 112 | } 113 | } 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "source": [ 118 | "# Add a print_response function to wrap the response of the models" 119 | ], 120 | "metadata": { 121 | "collapsed": false 122 | } 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 4, 127 | "outputs": [], 128 | "source": [ 129 | "def print_response(response: str):\n", 130 | " print(\"\\n\".join(textwrap.wrap(response, width=100)))" 131 | ], 132 | "metadata": { 133 | "collapsed": false, 134 | "ExecuteTime": { 135 | "start_time": "2023-05-15T12:19:27.130425Z", 136 | "end_time": "2023-05-15T12:19:27.140915Z" 137 | } 138 | } 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "source": [ 143 | "# Create a new openai api key\n" 144 | ], 145 | "metadata": { 146 | "collapsed": false 147 | } 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 5, 152 | "outputs": [], 153 | "source": [ 154 | "OPENAI_API_KEY = getpass()\n", 155 | "os.environ[\"OPENAI_API_KEY\"] = 
credentials.OPENAI_API_KEY" 156 | ], 157 | "metadata": { 158 | "collapsed": false, 159 | "ExecuteTime": { 160 | "start_time": "2023-05-15T12:19:27.958916Z", 161 | "end_time": "2023-05-15T12:19:34.140795Z" 162 | } 163 | } 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "source": [ 168 | "# Create a new chat model" 169 | ], 170 | "metadata": { 171 | "collapsed": false 172 | } 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 6, 177 | "metadata": { 178 | "collapsed": true, 179 | "ExecuteTime": { 180 | "start_time": "2023-05-15T12:19:36.515176Z", 181 | "end_time": "2023-05-15T12:19:36.519258Z" 182 | } 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "model = ChatOpenAI(temperature=0.3, model_name=\"gpt-3.5-turbo\")" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "source": [ 192 | "# Create a template for the prompt" 193 | ], 194 | "metadata": { 195 | "collapsed": false 196 | } 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 7, 201 | "outputs": [], 202 | "source": [ 203 | "template = \"\"\"\n", 204 | "You have to come up with location to shoot (along with a 20-50 word description)\n", 205 | "for a new episode of the TV show \"The Office\" based on the theme.\n", 206 | "\n", 207 | "{theme_suggestion}\n", 208 | "\n", 209 | "ANSWER:\n", 210 | "\"\"\"\n", 211 | "prompt = PromptTemplate(input_variables=[\"theme_suggestion\"], template=template)\n", 212 | "\n", 213 | "location_chain = LLMChain(llm=model, prompt=prompt, verbose=True)" 214 | ], 215 | "metadata": { 216 | "collapsed": false, 217 | "ExecuteTime": { 218 | "start_time": "2023-05-15T12:19:37.378075Z", 219 | "end_time": "2023-05-15T12:19:37.384846Z" 220 | } 221 | } 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 8, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "\n", 232 | "\n", 233 | "\u001B[1m> Entering new LLMChain chain...\u001B[0m\n", 234 | "Prompt after formatting:\n", 
235 | "\u001B[32;1m\u001B[1;3m\n", 236 | "You have to come up with location to shoot (along with a 20-50 word description)\n", 237 | "for a new episode of the TV show \"The Office\" based on the theme.\n", 238 | "\n", 239 | "Visiting Europe\n", 240 | "\n", 241 | "ANSWER:\n", 242 | "\u001B[0m\n", 243 | "\n", 244 | "\u001B[1m> Finished chain.\u001B[0m\n" 245 | ] 246 | }, 247 | { 248 | "data": { 249 | "text/plain": "{'theme_suggestion': 'Visiting Europe',\n 'text': 'The Office crew heads to Paris, France for a company retreat. While there, they struggle with language barriers, cultural differences, and a surprise visit from a former employee. Will they be able to bond and come together as a team in the City of Love?'}" 250 | }, 251 | "execution_count": 8, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "response = location_chain(\"Visiting Europe\")\n", 258 | "response" 259 | ], 260 | "metadata": { 261 | "collapsed": false, 262 | "ExecuteTime": { 263 | "start_time": "2023-05-15T12:19:38.001174Z", 264 | "end_time": "2023-05-15T12:19:43.487915Z" 265 | } 266 | } 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 9, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "The Office crew heads to Paris, France for a company retreat. While there, they struggle with\n", 277 | "language barriers, cultural differences, and a surprise visit from a former employee. 
Will they be\n", 278 | "able to bond and come together as a team in the City of Love?\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "print_response(response[\"text\"])" 284 | ], 285 | "metadata": { 286 | "collapsed": false, 287 | "ExecuteTime": { 288 | "start_time": "2023-05-15T12:19:44.684688Z", 289 | "end_time": "2023-05-15T12:19:44.694076Z" 290 | } 291 | } 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "source": [ 296 | "# Sequential chains\n", 297 | "\n", 298 | "\n", 299 | "[Documentation](https://python.langchain.com/en/latest/modules/chains/generic/sequential_chains.html)\n" 300 | ], 301 | "metadata": { 302 | "collapsed": false 303 | } 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 10, 308 | "outputs": [], 309 | "source": [ 310 | "template = \"\"\"\n", 311 | "Generate a short dialogue between Jim and Pam\n", 312 | "from the TV show \"The Office\" for a new episode based on the location\n", 313 | "\n", 314 | "{location}\n", 315 | "\n", 316 | "ANSWER:\n", 317 | "\"\"\"\n", 318 | "prompt = PromptTemplate(input_variables=[\"location\"], template=template)\n", 319 | "\n", 320 | "conversation_chain = LLMChain(llm=model, prompt=prompt, verbose=True)" 321 | ], 322 | "metadata": { 323 | "collapsed": false, 324 | "ExecuteTime": { 325 | "start_time": "2023-05-15T12:19:45.840574Z", 326 | "end_time": "2023-05-15T12:19:45.844871Z" 327 | } 328 | } 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "source": [ 333 | "### SimpleSequentialChain" 334 | ], 335 | "metadata": { 336 | "collapsed": false 337 | } 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 11, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "\n", 348 | "\n", 349 | "\u001B[1m> Entering new SimpleSequentialChain chain...\u001B[0m\n", 350 | "\n", 351 | "\n", 352 | "\u001B[1m> Entering new LLMChain chain...\u001B[0m\n", 353 | "Prompt after formatting:\n", 354 | "\u001B[32;1m\u001B[1;3m\n", 355 | "You 
have to come up with location to shoot (along with a 20-50 word description)\n", 356 | "for a new episode of the TV show \"The Office\" based on the theme.\n", 357 | "\n", 358 | "Visiting Europe\n", 359 | "\n", 360 | "ANSWER:\n", 361 | "\u001B[0m\n", 362 | "\n", 363 | "\u001B[1m> Finished chain.\u001B[0m\n", 364 | "\u001B[36;1m\u001B[1;3mThe Office crew heads to Paris for a business trip, but things take a romantic turn when Jim and Pam get lost in the city of love. Meanwhile, Dwight tries to navigate the French language and culture with hilarious results.\u001B[0m\n", 365 | "\n", 366 | "\n", 367 | "\u001B[1m> Entering new LLMChain chain...\u001B[0m\n", 368 | "Prompt after formatting:\n", 369 | "\u001B[32;1m\u001B[1;3m\n", 370 | "Generate a short dialogue between Jim and Pam\n", 371 | "from the TV show \"The Office\" for a new episode based on the location\n", 372 | "\n", 373 | "The Office crew heads to Paris for a business trip, but things take a romantic turn when Jim and Pam get lost in the city of love. Meanwhile, Dwight tries to navigate the French language and culture with hilarious results.\n", 374 | "\n", 375 | "ANSWER:\n", 376 | "\u001B[0m\n", 377 | "\n", 378 | "\u001B[1m> Finished chain.\u001B[0m\n", 379 | "\u001B[33;1m\u001B[1;3mJim: \"I can't believe we got lost in Paris, of all places.\"\n", 380 | "\n", 381 | "Pam: \"I know, right? But I have to admit, it's kind of romantic wandering around these streets with you.\"\n", 382 | "\n", 383 | "Jim: \"Yeah, it's like we're in our own little movie.\"\n", 384 | "\n", 385 | "Pam: \"Speaking of movies, we should find a cute little cafe and have some croissants and coffee.\"\n", 386 | "\n", 387 | "Jim: \"Sounds perfect. But first, let's take a selfie in front of the Eiffel Tower.\"\n", 388 | "\n", 389 | "Pam: \"Yes! And then we can send it to Dwight and Michael to make them jealous.\"\n", 390 | "\n", 391 | "Jim: \"Ha! 
They're probably struggling to order food in French right now.\"\n", 392 | "\n", 393 | "Pam: \"Well, at least we have each other to navigate this city with.\"\n", 394 | "\n", 395 | "Jim: \"Always, Pam. Always.\"\u001B[0m\n", 396 | "\n", 397 | "\u001B[1m> Finished chain.\u001B[0m\n" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "sequential_chain = SimpleSequentialChain(\n", 403 | " chains=[location_chain, conversation_chain], verbose=True\n", 404 | ")\n", 405 | "response = sequential_chain.run(\"Visiting Europe\")\n" 406 | ], 407 | "metadata": { 408 | "collapsed": false, 409 | "ExecuteTime": { 410 | "start_time": "2023-05-15T12:19:46.918422Z", 411 | "end_time": "2023-05-15T12:20:09.152045Z" 412 | } 413 | } 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 12, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "Jim: \"I can't believe we got lost in Paris, of all places.\"\n", 424 | "\n", 425 | "Pam: \"I know, right? But I have to admit, it's kind of romantic wandering around these streets with you.\"\n", 426 | "\n", 427 | "Jim: \"Yeah, it's like we're in our own little movie.\"\n", 428 | "\n", 429 | "Pam: \"Speaking of movies, we should find a cute little cafe and have some croissants and coffee.\"\n", 430 | "\n", 431 | "Jim: \"Sounds perfect. But first, let's take a selfie in front of the Eiffel Tower.\"\n", 432 | "\n", 433 | "Pam: \"Yes! And then we can send it to Dwight and Michael to make them jealous.\"\n", 434 | "\n", 435 | "Jim: \"Ha! They're probably struggling to order food in French right now.\"\n", 436 | "\n", 437 | "Pam: \"Well, at least we have each other to navigate this city with.\"\n", 438 | "\n", 439 | "Jim: \"Always, Pam. 
Always.\"\n" 440 | ] 441 | } 442 | ], 443 | "source": [ 444 | "conversation = response\n", 445 | "print(conversation)" 446 | ], 447 | "metadata": { 448 | "collapsed": false, 449 | "ExecuteTime": { 450 | "start_time": "2023-05-15T12:20:12.162182Z", 451 | "end_time": "2023-05-15T12:20:12.174800Z" 452 | } 453 | } 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "source": [ 458 | "# Summarization\n", 459 | "\n", 460 | "There are four different chain types: stuff, map_reduce, refine and map_rerank. Each of these is described in more detail below.\n", 461 | "\n", 462 | "\n", 463 | "### Stuffing\n", 464 | "Stuffing is the simplest method, whereby you simply stuff all the related data into the prompt as context to pass to the language model. This is implemented in LangChain as the StuffDocumentsChain.\n", 465 | "\n", 466 | "Pros: Only makes a single call to the LLM. When generating text, the LLM has access to all the data at once.\n", 467 | "\n", 468 | "Cons: Most LLMs have a limited context length, and for large documents (or many documents) this will not work as it will result in a prompt larger than the context length.\n", 469 | "\n", 470 | "The main downside of this method is that it only works on smaller pieces of data. Once you are working with many pieces of data, this approach is no longer feasible. The next two approaches are designed to help deal with that.\n", 471 | "\n", 472 | "\n", 473 | "### Map Reduce\n", 474 | "This method involves running an initial prompt on each chunk of data (for summarization tasks, this could be a summary of that chunk; for question-answering tasks, it could be an answer based solely on that chunk). Then a different prompt is run to combine all the initial outputs. This is implemented in LangChain as the MapReduceDocumentsChain.\n", 475 | "\n", 476 | "Pros: Can scale to larger documents (and more documents) than StuffDocumentsChain. 
The calls to the LLM on individual documents are independent and can therefore be parallelized.\n", 477 | "\n", 478 | "Cons: Requires many more calls to the LLM than StuffDocumentsChain. Loses some information during the final combined call.\n", 479 | "\n", 480 | "\n", 481 | "### Refine\n", 482 | "This method involves running an initial prompt on the first chunk of data, generating some output. For the remaining documents, that output is passed in, along with the next document, asking the LLM to refine the output based on the new document.\n", 483 | "\n", 484 | "Pros: Can pull in more relevant context, and may be less lossy than MapReduceDocumentsChain.\n", 485 | "\n", 486 | "Cons: Requires many more calls to the LLM than StuffDocumentsChain. The calls are also NOT independent, meaning they cannot be parallelized like MapReduceDocumentsChain. There are also some potential dependencies on the ordering of the documents.\n", 487 | "\n", 488 | "\n", 489 | "### Map-Rerank (not implemented for summarization)\n", 490 | "This method involves running an initial prompt on each chunk of data, that not only tries to complete a task but also gives a score for how certain it is in its answer. The responses are then ranked according to this score, and the response with the highest score is returned.\n", 491 | "\n", 492 | "Pros: Similar pros as MapReduceDocumentsChain. Requires fewer calls, compared to MapReduceDocumentsChain.\n", 493 | "\n", 494 | "Cons: Cannot combine information between documents. 
This means it is most useful when you expect there to be a single simple answer in a single document.\n", 495 | "\n", 496 | "[Documentation](https://docs.langchain.com/docs/components/chains/index_related_chains)\n", 497 | "[Example](https://python.langchain.com/en/latest/modules/chains/index_examples/summarize.html)" 498 | ], 499 | "metadata": { 500 | "collapsed": false 501 | } 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 13, 506 | "outputs": [], 507 | "source": [ 508 | "template = \"\"\"\n", 509 | "Write a concise bullet list summary of the conversation between Jim and Pam from the TV show \"The Office\":\n", 510 | "\n", 511 | "{text}\n", 512 | "\n", 513 | "Concise summary using markdown:\"\"\"\n", 514 | "\n", 515 | "prompt = PromptTemplate(template=template, input_variables=[\"text\"])\n", 516 | "summary_chain = load_summarize_chain(\n", 517 | " model, chain_type=\"stuff\", verbose=True, prompt=prompt\n", 518 | ")\n" 519 | ], 520 | "metadata": { 521 | "collapsed": false, 522 | "ExecuteTime": { 523 | "start_time": "2023-05-15T12:20:15.725018Z", 524 | "end_time": "2023-05-15T12:20:15.729853Z" 525 | } 526 | } 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 14, 531 | "outputs": [ 532 | { 533 | "data": { 534 | "text/plain": "[Document(page_content='Jim: \"I can\\'t believe we got lost in Paris, of all places.\"\\n\\nPam: \"I know, right? But I have to admit, it\\'s kind of romantic wandering around these streets with you.\"\\n\\nJim: \"Yeah, it\\'s like we\\'re in our own little movie.\"\\n\\nPam: \"Speaking of movies, we should find a cute little cafe and have some croissants and coffee.\"\\n\\nJim: \"Sounds perfect. But first, let\\'s take a selfie in front of the Eiffel Tower.\"\\n\\nPam: \"Yes! And then we can send it to Dwight and Michael to make them jealous.\"\\n\\nJim: \"Ha! 
They\\'re probably struggling to order food in French right now.\"\\n\\nPam: \"Well, at least we have each other to navigate this city with.\"\\n\\nJim: \"Always, Pam. Always.\"', metadata={})]" 535 | }, 536 | "execution_count": 14, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "docs = [Document(page_content=conversation)]\n", 543 | "docs" 544 | ], 545 | "metadata": { 546 | "collapsed": false, 547 | "ExecuteTime": { 548 | "start_time": "2023-05-15T12:20:16.460643Z", 549 | "end_time": "2023-05-15T12:20:16.464584Z" 550 | } 551 | } 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 15, 556 | "outputs": [ 557 | { 558 | "name": "stdout", 559 | "output_type": "stream", 560 | "text": [ 561 | "\n", 562 | "\n", 563 | "\u001B[1m> Entering new StuffDocumentsChain chain...\u001B[0m\n", 564 | "\n", 565 | "\n", 566 | "\u001B[1m> Entering new LLMChain chain...\u001B[0m\n", 567 | "Prompt after formatting:\n", 568 | "\u001B[32;1m\u001B[1;3m\n", 569 | "Write a concise bullet list summary of the conversation between Jim and Pam from the TV show \"The Office\":\n", 570 | "\n", 571 | "Jim: \"I can't believe we got lost in Paris, of all places.\"\n", 572 | "\n", 573 | "Pam: \"I know, right? But I have to admit, it's kind of romantic wandering around these streets with you.\"\n", 574 | "\n", 575 | "Jim: \"Yeah, it's like we're in our own little movie.\"\n", 576 | "\n", 577 | "Pam: \"Speaking of movies, we should find a cute little cafe and have some croissants and coffee.\"\n", 578 | "\n", 579 | "Jim: \"Sounds perfect. But first, let's take a selfie in front of the Eiffel Tower.\"\n", 580 | "\n", 581 | "Pam: \"Yes! And then we can send it to Dwight and Michael to make them jealous.\"\n", 582 | "\n", 583 | "Jim: \"Ha! They're probably struggling to order food in French right now.\"\n", 584 | "\n", 585 | "Pam: \"Well, at least we have each other to navigate this city with.\"\n", 586 | "\n", 587 | "Jim: \"Always, Pam. 
Always.\"\n", 588 | "\n", 589 | "Concise summary using markdown:\u001B[0m\n", 590 | "\n", 591 | "\u001B[1m> Finished chain.\u001B[0m\n", 592 | "\n", 593 | "\u001B[1m> Finished chain.\u001B[0m\n" 594 | ] 595 | } 596 | ], 597 | "source": [ 598 | "summary_result = summary_chain.run(docs)" 599 | ], 600 | "metadata": { 601 | "collapsed": false, 602 | "ExecuteTime": { 603 | "start_time": "2023-05-15T12:20:17.657879Z", 604 | "end_time": "2023-05-15T12:20:29.138814Z" 605 | } 606 | } 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 16, 611 | "outputs": [ 612 | { 613 | "name": "stdout", 614 | "output_type": "stream", 615 | "text": [ 616 | "- Jim and Pam got lost in Paris.\n", 617 | "- They find it romantic to wander around the streets together.\n", 618 | "- They plan to find a cute cafe and have croissants and coffee.\n", 619 | "- They want to take a selfie in front of the Eiffel Tower and send it to Dwight and Michael to make them jealous.\n", 620 | "- They are grateful to have each other to navigate the city with.\n" 621 | ] 622 | } 623 | ], 624 | "source": [ 625 | "print(summary_result)" 626 | ], 627 | "metadata": { 628 | "collapsed": false, 629 | "ExecuteTime": { 630 | "start_time": "2023-05-15T12:20:31.856412Z", 631 | "end_time": "2023-05-15T12:20:31.865826Z" 632 | } 633 | } 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "source": [ 638 | "# Question answering\n", 639 | "\n", 640 | "Here we look at how to use LangChain for question answering over a list of documents. 
It covers four different types of chains: stuff, map_reduce, refine, map_rerank\n", 641 | "\n", 642 | "[Example](https://python.langchain.com/en/latest/modules/chains/index_examples/question_answering.html)" 643 | ], 644 | "metadata": { 645 | "collapsed": false 646 | } 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 17, 651 | "outputs": [], 652 | "source": [ 653 | "template = \"\"\"\n", 654 | "You have to come up with a 200-300 word script for a new episode\n", 655 | "of the TV show \"The Office\" based on the theme\n", 656 | "\n", 657 | "{theme_suggestion}\n", 658 | "\n", 659 | "ANSWER:\n", 660 | "\"\"\"\n", 661 | "prompt = PromptTemplate(input_variables=[\"theme_suggestion\"], template=template)\n", 662 | "\n", 663 | "script_chain = LLMChain(llm=model, prompt=prompt, verbose=True)\n" 664 | ], 665 | "metadata": { 666 | "collapsed": false, 667 | "ExecuteTime": { 668 | "start_time": "2023-05-15T12:20:33.722925Z", 669 | "end_time": "2023-05-15T12:20:33.728311Z" 670 | } 671 | } 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": 21, 676 | "outputs": [ 677 | { 678 | "name": "stdout", 679 | "output_type": "stream", 680 | "text": [ 681 | "\n", 682 | "\n", 683 | "\u001B[1m> Entering new LLMChain chain...\u001B[0m\n", 684 | "Prompt after formatting:\n", 685 | "\u001B[32;1m\u001B[1;3m\n", 686 | "You have to come up with a 200-300 word script for a new episode\n", 687 | "of the TV show \"The Office\" based on the theme\n", 688 | "\n", 689 | "Going to the moon\n", 690 | "\n", 691 | "ANSWER:\n", 692 | "\u001B[0m\n", 693 | "\n", 694 | "\u001B[1m> Finished chain.\u001B[0m\n" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "script_response = script_chain(\"Going to the moon\")" 700 | ], 701 | "metadata": { 702 | "collapsed": false, 703 | "ExecuteTime": { 704 | "start_time": "2023-05-15T12:24:06.552617Z", 705 | "end_time": "2023-05-15T12:25:02.920288Z" 706 | } 707 | } 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": 22, 
712 | "outputs": [ 713 | { 714 | "name": "stdout", 715 | "output_type": "stream", 716 | "text": [ 717 | "FADE IN:\n", 718 | "\n", 719 | "INT. DUNDER MIFFLIN SCRANTON - DAY\n", 720 | "\n", 721 | "The employees of Dunder Mifflin are gathered in the conference room for a meeting. Michael Scott is standing at the front of the room, holding a toy rocket ship.\n", 722 | "\n", 723 | "MICHAEL: Good morning, everyone! Today, we’re going to talk about the moon.\n", 724 | "\n", 725 | "JIM: (whispering to Pam) Is he serious?\n", 726 | "\n", 727 | "PAM: (whispering back) I don’t know, but I’m afraid to ask.\n", 728 | "\n", 729 | "MICHAEL: As you all know, NASA is planning a mission to the moon in a few years. And I’ve been thinking, why should they have all the fun?\n", 730 | "\n", 731 | "DWIGHT: (excitedly) Are you suggesting we go to the moon, Michael?\n", 732 | "\n", 733 | "MICHAEL: (nodding) Yes, Dwight. I am.\n", 734 | "\n", 735 | "JIM: (sarcastically) Oh, great. Another one of Michael’s brilliant ideas.\n", 736 | "\n", 737 | "MICHAEL: (ignoring Jim) I’ve already contacted a space travel agency and they’ve agreed to take us to the moon.\n", 738 | "\n", 739 | "PAM: (concerned) Michael, I don’t think that’s a good idea. Going to the moon is dangerous.\n", 740 | "\n", 741 | "MICHAEL: (defensive) Pam, I’m not going to let a little thing like danger stop us from achieving our dreams.\n", 742 | "\n", 743 | "The employees exchange worried glances.\n", 744 | "\n", 745 | "CUT TO:\n", 746 | "\n", 747 | "INT. SPACE TRAVEL AGENCY - DAY\n", 748 | "\n", 749 | "Michael, Dwight, Jim, and Pam are standing in front of a large rocket ship.\n", 750 | "\n", 751 | "MICHAEL: (excitedly) This is it, guys. Our ticket to the moon.\n", 752 | "\n", 753 | "JIM: (sarcastically) Yay.\n", 754 | "\n", 755 | "PAM: (nervously) Michael, are you sure about this?\n", 756 | "\n", 757 | "MICHAEL: (confidently) Absolutely, Pam. 
Trust me, this is going to be the adventure of a lifetime.\n", 758 | "\n", 759 | "The employees reluctantly board the rocket ship.\n", 760 | "\n", 761 | "CUT TO:\n", 762 | "\n", 763 | "INT. ROCKET SHIP - DAY\n", 764 | "\n", 765 | "The employees are strapped into their seats, looking nervous.\n", 766 | "\n", 767 | "MICHAEL: (over the intercom) Attention, everyone. We are about to take off. Please fasten your seatbelts and prepare for liftoff.\n", 768 | "\n", 769 | "The rocket ship shakes and rumbles as it takes off.\n", 770 | "\n", 771 | "CUT TO:\n", 772 | "\n", 773 | "INT. MOON - DAY\n", 774 | "\n", 775 | "The employees are standing on the surface of the moon, wearing space suits.\n", 776 | "\n", 777 | "JIM: (in awe) Wow. We’re actually on the moon.\n", 778 | "\n", 779 | "DWIGHT: (excitedly) This is amazing. I can’t wait to explore.\n", 780 | "\n", 781 | "PAM: (worriedly) Michael, we need to get back to Earth. We can’t stay here forever.\n", 782 | "\n", 783 | "MICHAEL: (disappointed) Fine. Let’s go back.\n", 784 | "\n", 785 | "The employees board the rocket ship and take off.\n", 786 | "\n", 787 | "CUT TO:\n", 788 | "\n", 789 | "INT. DUNDER MIFFLIN SCRANTON - DAY\n", 790 | "\n", 791 | "The employees are back in the conference room, looking exhausted.\n", 792 | "\n", 793 | "MICHAEL: (smiling) Well, that was quite an adventure, wasn’t it?\n", 794 | "\n", 795 | "JIM: (sarcastically) Yeah, Michael. It was a blast.\n", 796 | "\n", 797 | "PAM: (smiling) I have to admit, it was pretty cool.\n", 798 | "\n", 799 | "DWIGHT: (excitedly) Can we go back to the moon someday?\n", 800 | "\n", 801 | "MICHAEL: (smiling) Who knows, Dwight. 
Maybe we will.\n", 802 | "\n", 803 | "FADE OUT.\n" 804 | ] 805 | } 806 | ], 807 | "source": [ 808 | "script = script_response[\"text\"]\n", 809 | "print(script)" 810 | ], 811 | "metadata": { 812 | "collapsed": false, 813 | "ExecuteTime": { 814 | "start_time": "2023-05-15T12:25:23.104278Z", 815 | "end_time": "2023-05-15T12:25:23.111993Z" 816 | } 817 | } 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "source": [ 822 | "### Split the script into chunks" 823 | ], 824 | "metadata": { 825 | "collapsed": false 826 | } 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": 23, 831 | "outputs": [ 832 | { 833 | "data": { 834 | "text/plain": "2" 835 | }, 836 | "execution_count": 23, 837 | "metadata": {}, 838 | "output_type": "execute_result" 839 | } 840 | ], 841 | "source": [ 842 | "script_docs = [Document(page_content=script)]\n", 843 | "\n", 844 | "text_splitter = CharacterTextSplitter(chunk_size=2048, chunk_overlap=32)\n", 845 | "texts = text_splitter.split_documents(script_docs)\n", 846 | "len(texts)" 847 | ], 848 | "metadata": { 849 | "collapsed": false, 850 | "ExecuteTime": { 851 | "start_time": "2023-05-15T12:26:41.515984Z", 852 | "end_time": "2023-05-15T12:26:41.519878Z" 853 | } 854 | } 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": 24, 859 | "outputs": [ 860 | { 861 | "data": { 862 | "text/plain": "Document(page_content='FADE IN:\\n\\nINT. DUNDER MIFFLIN SCRANTON - DAY\\n\\nThe employees of Dunder Mifflin are gathered in the conference room for a meeting. Michael Scott is standing at the front of the room, holding a toy rocket ship.\\n\\nMICHAEL: Good morning, everyone! Today, we’re going to talk about the moon.\\n\\nJIM: (whispering to Pam) Is he serious?\\n\\nPAM: (whispering back) I don’t know, but I’m afraid to ask.\\n\\nMICHAEL: As you all know, NASA is planning a mission to the moon in a few years. 
And I’ve been thinking, why should they have all the fun?\\n\\nDWIGHT: (excitedly) Are you suggesting we go to the moon, Michael?\\n\\nMICHAEL: (nodding) Yes, Dwight. I am.\\n\\nJIM: (sarcastically) Oh, great. Another one of Michael’s brilliant ideas.\\n\\nMICHAEL: (ignoring Jim) I’ve already contacted a space travel agency and they’ve agreed to take us to the moon.\\n\\nPAM: (concerned) Michael, I don’t think that’s a good idea. Going to the moon is dangerous.\\n\\nMICHAEL: (defensive) Pam, I’m not going to let a little thing like danger stop us from achieving our dreams.\\n\\nThe employees exchange worried glances.\\n\\nCUT TO:\\n\\nINT. SPACE TRAVEL AGENCY - DAY\\n\\nMichael, Dwight, Jim, and Pam are standing in front of a large rocket ship.\\n\\nMICHAEL: (excitedly) This is it, guys. Our ticket to the moon.\\n\\nJIM: (sarcastically) Yay.\\n\\nPAM: (nervously) Michael, are you sure about this?\\n\\nMICHAEL: (confidently) Absolutely, Pam. Trust me, this is going to be the adventure of a lifetime.\\n\\nThe employees reluctantly board the rocket ship.\\n\\nCUT TO:\\n\\nINT. ROCKET SHIP - DAY\\n\\nThe employees are strapped into their seats, looking nervous.\\n\\nMICHAEL: (over the intercom) Attention, everyone. We are about to take off. Please fasten your seatbelts and prepare for liftoff.\\n\\nThe rocket ship shakes and rumbles as it takes off.\\n\\nCUT TO:\\n\\nINT. MOON - DAY\\n\\nThe employees are standing on the surface of the moon, wearing space suits.\\n\\nJIM: (in awe) Wow. We’re actually on the moon.\\n\\nDWIGHT: (excitedly) This is amazing. 
I can’t wait to explore.', metadata={})" 863 | }, 864 | "execution_count": 24, 865 | "metadata": {}, 866 | "output_type": "execute_result" 867 | } 868 | ], 869 | "source": [ 870 | "texts[0]" 871 | ], 872 | "metadata": { 873 | "collapsed": false, 874 | "ExecuteTime": { 875 | "start_time": "2023-05-15T12:26:42.404655Z", 876 | "end_time": "2023-05-15T12:26:42.409217Z" 877 | } 878 | } 879 | }, 880 | { 881 | "cell_type": "markdown", 882 | "source": [ 883 | "### Create a database of embeddings" 884 | ], 885 | "metadata": { 886 | "collapsed": false 887 | } 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 25, 892 | "outputs": [], 893 | "source": [ 894 | "embeddings = OpenAIEmbeddings()" 895 | ], 896 | "metadata": { 897 | "collapsed": false, 898 | "ExecuteTime": { 899 | "start_time": "2023-05-15T12:26:44.625644Z", 900 | "end_time": "2023-05-15T12:26:44.631084Z" 901 | } 902 | } 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 26, 907 | "outputs": [ 908 | { 909 | "name": "stderr", 910 | "output_type": "stream", 911 | "text": [ 912 | "Using embedded DuckDB without persistence: data will be transient\n" 913 | ] 914 | } 915 | ], 916 | "source": [ 917 | "db = Chroma.from_documents(texts, embeddings)" 918 | ], 919 | "metadata": { 920 | "collapsed": false, 921 | "ExecuteTime": { 922 | "start_time": "2023-05-15T12:26:45.334841Z", 923 | "end_time": "2023-05-15T12:26:48.566773Z" 924 | } 925 | } 926 | }, 927 | { 928 | "cell_type": "markdown", 929 | "source": [ 930 | "### Create a question answering chain" 931 | ], 932 | "metadata": { 933 | "collapsed": false 934 | } 935 | }, 936 | { 937 | "cell_type": "code", 938 | "execution_count": 27, 939 | "outputs": [], 940 | "source": [ 941 | "qa_chain = RetrievalQA.from_chain_type(\n", 942 | " llm=model, chain_type=\"stuff\", retriever=db.as_retriever(search_kwargs={\"k\": 2}, verbose=True)\n", 943 | ")" 944 | ], 945 | "metadata": { 946 | "collapsed": false, 947 | "ExecuteTime": { 948 | "start_time": 
"2023-05-15T12:26:50.466600Z", 949 | "end_time": "2023-05-15T12:26:50.470999Z" 950 | } 951 | } 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": 28, 956 | "outputs": [ 957 | { 958 | "data": { 959 | "text/plain": "{'query': 'What is the place that The Office team is visiting?',\n 'result': 'The Office team is visiting the moon.'}" 960 | }, 961 | "execution_count": 28, 962 | "metadata": {}, 963 | "output_type": "execute_result" 964 | } 965 | ], 966 | "source": [ 967 | "response = qa_chain(\"What is the place that The Office team is visiting?\")\n", 968 | "response" 969 | ], 970 | "metadata": { 971 | "collapsed": false, 972 | "ExecuteTime": { 973 | "start_time": "2023-05-15T12:26:51.641574Z", 974 | "end_time": "2023-05-15T12:26:53.330807Z" 975 | } 976 | } 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": 29, 981 | "outputs": [ 982 | { 983 | "name": "stdout", 984 | "output_type": "stream", 985 | "text": [ 986 | "The Office team is visiting the moon.\n" 987 | ] 988 | } 989 | ], 990 | "source": [ 991 | "print_response(response[\"result\"])" 992 | ], 993 | "metadata": { 994 | "collapsed": false, 995 | "ExecuteTime": { 996 | "start_time": "2023-05-15T12:26:59.189488Z", 997 | "end_time": "2023-05-15T12:26:59.195372Z" 998 | } 999 | } 1000 | }, 1001 | { 1002 | "cell_type": "markdown", 1003 | "source": [ 1004 | "#### Let's test whether it knows that this is not in the text" 1005 | ], 1006 | "metadata": { 1007 | "collapsed": false 1008 | } 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": 30, 1013 | "outputs": [ 1014 | { 1015 | "name": "stdout", 1016 | "output_type": "stream", 1017 | "text": [ 1018 | "There is no information provided in the context that suggests the client doesn't want to deal with\n", 1019 | "Michael.\n" 1020 | ] 1021 | } 1022 | ], 1023 | "source": [ 1024 | "response = qa_chain(\"Why the client doesn't want to deal with Michael?\")\n", 1025 | "print_response(response[\"result\"])" 1026 | ], 1027 | 
"metadata": { 1028 | "collapsed": false, 1029 | "ExecuteTime": { 1030 | "start_time": "2023-05-15T12:27:26.246452Z", 1031 | "end_time": "2023-05-15T12:27:30.268906Z" 1032 | } 1033 | } 1034 | }, 1035 | { 1036 | "cell_type": "markdown", 1037 | "source": [ 1038 | "# Bash Chain\n", 1039 | "\n", 1040 | "The BashChain is a special chain that allows you to run bash commands. It is useful for chaining together bash commands with other chains.\n", 1041 | "[Documentation](https://python.langchain.com/en/latest/modules/chains/examples/llm_bash.html?highlight=bash%20chain)" 1042 | ], 1043 | "metadata": { 1044 | "collapsed": false 1045 | } 1046 | }, 1047 | { 1048 | "cell_type": "code", 1049 | "execution_count": 31, 1050 | "outputs": [ 1051 | { 1052 | "name": "stdout", 1053 | "output_type": "stream", 1054 | "text": [ 1055 | "\n", 1056 | "\n", 1057 | "\u001B[1m> Entering new LLMBashChain chain...\u001B[0m\n", 1058 | "\n", 1059 | "Please write a bash script that prints a single line that Michael G. Scott from \"The Office\" might say\n", 1060 | "\u001B[32;1m\u001B[1;3m\n", 1061 | "```bash\n", 1062 | "echo \"That's what she said!\"\n", 1063 | "```\u001B[0m['```bash', 'echo \"That\\'s what she said!\"', '```']\n", 1064 | "\n", 1065 | "Answer: \u001B[33;1m\u001B[1;3mThat's what she said!\n", 1066 | "\u001B[0m\n", 1067 | "\u001B[1m> Finished chain.\u001B[0m\n" 1068 | ] 1069 | }, 1070 | { 1071 | "data": { 1072 | "text/plain": "\"That's what she said!\\n\"" 1073 | }, 1074 | "execution_count": 31, 1075 | "metadata": {}, 1076 | "output_type": "execute_result" 1077 | } 1078 | ], 1079 | "source": [ 1080 | "text = \"\"\"\n", 1081 | "Please write a bash script that prints a single line that Michael G. 
Scott from \"The Office\" might say\n", 1082 | "\"\"\"\n", 1083 | "\n", 1084 | "bash_chain = LLMBashChain(llm=OpenAI(temperature=0), verbose=True)\n", 1085 | "\n", 1086 | "bash_chain.run(text)" 1087 | ], 1088 | "metadata": { 1089 | "collapsed": false, 1090 | "ExecuteTime": { 1091 | "start_time": "2023-05-15T12:27:40.223294Z", 1092 | "end_time": "2023-05-15T12:27:44.388835Z" 1093 | } 1094 | } 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": null, 1099 | "outputs": [], 1100 | "source": [], 1101 | "metadata": { 1102 | "collapsed": false 1103 | } 1104 | }, 1105 | { 1106 | "cell_type": "code", 1107 | "execution_count": null, 1108 | "outputs": [], 1109 | "source": [], 1110 | "metadata": { 1111 | "collapsed": false 1112 | } 1113 | } 1114 | ], 1115 | "metadata": { 1116 | "kernelspec": { 1117 | "display_name": "Python 3", 1118 | "language": "python", 1119 | "name": "python3" 1120 | }, 1121 | "language_info": { 1122 | "codemirror_mode": { 1123 | "name": "ipython", 1124 | "version": 2 1125 | }, 1126 | "file_extension": ".py", 1127 | "mimetype": "text/x-python", 1128 | "name": "python", 1129 | "nbconvert_exporter": "python", 1130 | "pygments_lexer": "ipython2", 1131 | "version": "2.7.6" 1132 | } 1133 | }, 1134 | "nbformat": 4, 1135 | "nbformat_minor": 0 1136 | } 1137 | -------------------------------------------------------------------------------- /notebooks/langchain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting started with Langchain\n", 8 | "\n", 9 | "In this tutorial, we will walk through the basics of using LangChain to create a simple AI chatbot that can answer questions using OpenAI and Serpapi." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "ExecuteTime": { 17 | "end_time": "2023-05-07T10:53:34.209012Z", 18 | "start_time": "2023-05-07T10:53:31.599400Z" 19 | } 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# import libraries\n", 24 | "import os\n", 25 | "from langchain.llms import OpenAI" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "ExecuteTime": { 33 | "end_time": "2023-05-04T07:51:05.142310Z", 34 | "start_time": "2023-05-04T07:51:05.138413Z" 35 | } 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "#create a new openai api key\n", 40 | "#os.environ[\"OPENAI_API_KEY\"] = \"...\"" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": { 47 | "ExecuteTime": { 48 | "end_time": "2023-05-04T07:51:05.737195Z", 49 | "start_time": "2023-05-04T07:51:05.731132Z" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# set up openai api key\n", 55 | "openai_api_key = os.environ.get('OPENAI_API_KEY')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### The temperature of a llm is a hyperparameter that controls the randomness of the output. It is a value between 0 and 1. A higher temperature will result in more random output. A lower temperature will result in more predictable output. For this tutorial, we will set the temperature to 0.9. You can play around with this yourself to see how it affects the output." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 6, 68 | "metadata": { 69 | "ExecuteTime": { 70 | "end_time": "2023-05-04T07:51:09.477227Z", 71 | "start_time": "2023-05-04T07:51:08.896315Z" 72 | } 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# create a llm\n", 77 | "llm = OpenAI(temperature = 0.9)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 7, 83 | "metadata": { 84 | "ExecuteTime": { 85 | "end_time": "2023-05-04T07:51:15.146461Z", 86 | "start_time": "2023-05-04T07:51:10.387988Z" 87 | } 88 | }, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "\n", 95 | "\n", 96 | "1. Ireland \n", 97 | "2. Russia \n", 98 | "3. United States \n", 99 | "4. Germany\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "text = \"What are 4 countries where they eat a lot of potatoes?\"\n", 105 | "print(llm(text))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "### This works! But what if we want to ask a question about a different food? We can use the prompt template class to do this." 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "# Getting started with prompt templates\n", 120 | "\n", 121 | "### A prompt template is a string that contains variables that can be filled in with different values. 
For example, you could have a prompt template that looks like this:" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 8, 127 | "metadata": { 128 | "ExecuteTime": { 129 | "end_time": "2023-05-04T07:51:21.064906Z", 130 | "start_time": "2023-05-04T07:51:21.055502Z" 131 | } 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "from langchain.prompts import PromptTemplate" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 9, 141 | "metadata": { 142 | "ExecuteTime": { 143 | "end_time": "2023-05-04T07:51:21.665992Z", 144 | "start_time": "2023-05-04T07:51:21.661148Z" 145 | } 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "prompt = PromptTemplate(\n", 150 | " input_variables=[\"food\"],\n", 151 | " template=\"What are 4 countries where they eat a lot of {food}?\",\n", 152 | ")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 10, 158 | "metadata": { 159 | "ExecuteTime": { 160 | "end_time": "2023-05-04T07:51:22.267425Z", 161 | "start_time": "2023-05-04T07:51:22.259980Z" 162 | } 163 | }, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "What are 4 countries where they eat a lot of potatoes?\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "print(prompt.format(food=\"potatoes\"))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 11, 180 | "metadata": { 181 | "ExecuteTime": { 182 | "end_time": "2023-05-04T07:51:25.790414Z", 183 | "start_time": "2023-05-04T07:51:23.070055Z" 184 | } 185 | }, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "\n", 192 | "\n", 193 | "1. Ireland \n", 194 | "2. Germany \n", 195 | "3. Russia \n", 196 | "4. 
Poland\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "print(llm(prompt.format(food=\"potatoes\")))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "Notice how the answer is different from the previous one. This is because of the randomness of the llm. If we want more repeatable answers, we can lower the temperature of the llm (for example, to 0)." 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 13, 214 | "metadata": { 215 | "ExecuteTime": { 216 | "end_time": "2023-05-04T07:52:37.534975Z", 217 | "start_time": "2023-05-04T07:52:33.797048Z" 218 | } 219 | }, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "\n", 226 | "\n", 227 | "1. China \n", 228 | "2. India \n", 229 | "3. Indonesia \n", 230 | "4. Japan\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "print(llm(prompt.format(food=\"rice\")))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "# Chaining\n", 243 | "\n", 244 | "### Here, we combine LLMs and prompts in multistep workflows using the prompt template class." 
245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 14, 250 | "metadata": { 251 | "ExecuteTime": { 252 | "end_time": "2023-05-04T07:52:42.076305Z", 253 | "start_time": "2023-05-04T07:52:42.073724Z" 254 | } 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "# import libraries\n", 259 | "from langchain.prompts import PromptTemplate\n", 260 | "from langchain.llms import OpenAI\n", 261 | "from langchain.chains import LLMChain" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 15, 267 | "metadata": { 268 | "ExecuteTime": { 269 | "end_time": "2023-05-04T07:52:42.738643Z", 270 | "start_time": "2023-05-04T07:52:42.733814Z" 271 | } 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "llm = OpenAI(temperature=0.9)\n", 276 | "\n", 277 | "prompt = PromptTemplate(\n", 278 | " input_variables=[\"food\"],\n", 279 | " template=\"What are 4 countries where they eat a lot of {food}?\",\n", 280 | ")" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 16, 286 | "metadata": { 287 | "ExecuteTime": { 288 | "end_time": "2023-05-04T07:52:44.089168Z", 289 | "start_time": "2023-05-04T07:52:44.083051Z" 290 | } 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "chain = LLMChain(llm=llm, prompt=prompt)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 17, 300 | "metadata": { 301 | "ExecuteTime": { 302 | "end_time": "2023-05-04T07:52:47.473090Z", 303 | "start_time": "2023-05-04T07:52:44.458688Z" 304 | } 305 | }, 306 | "outputs": [ 307 | { 308 | "name": "stdout", 309 | "output_type": "stream", 310 | "text": [ 311 | "\n", 312 | "\n", 313 | "1. Ireland\n", 314 | "2. United States\n", 315 | "3. Germany\n", 316 | "4. United Kingdom\n", 317 | "\n", 318 | "\n", 319 | "1. China\n", 320 | "2. Japan\n", 321 | "3. India\n", 322 | "4. 
Indonesia\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "print(chain.run(\"potatoes\"))\n", 328 | "print(chain.run(\"rice\"))" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "# Agents: Dynamically call chains based on user input\n", 336 | "\n", 337 | "SerpApi is a search engine results page (SERP) scraping and parsing API that allows developers to retrieve and analyze data from various search engines, including Google, Bing, Yahoo, and more.\n", 338 | "\n", 339 | "\n", 340 | "To use SerpApi in Python, you will need to install the SerpApi Python module using pip. An API key can be created here:\n", 341 | "\n", 342 | "[Serpapi key](https://serpapi.com/users/sign_up)\n", 343 | "\n", 344 | "[Serpapi documentation](https://python.langchain.com/en/latest/modules/agents/tools/examples/serpapi.html)\n", 345 | "\n", 346 | "In the next part, we are going to combine OpenAI with SerpApi to create a chatbot that can answer questions with internet search results.\n", 347 | "\n", 348 | "[Agent langchain documentation](https://python.langchain.com/en/latest/modules/agents.html)\n", 349 | "\n", 350 | "\n" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 89, 356 | "metadata": { 357 | "ExecuteTime": { 358 | "end_time": "2023-04-25T19:11:54.670645Z", 359 | "start_time": "2023-04-25T19:11:51.130043Z" 360 | } 361 | }, 362 | "outputs": [ 363 | { 364 | "name": "stdout", 365 | "output_type": "stream", 366 | "text": [ 367 | "Requirement already satisfied: google-search-results in /Users/erictak/miniconda3/lib/python3.10/site-packages (2.4.2)\r\n", 368 | "Requirement already satisfied: requests in /Users/erictak/miniconda3/lib/python3.10/site-packages (from google-search-results) (2.28.1)\r\n", 369 | "Requirement already satisfied: certifi>=2017.4.17 in /Users/erictak/miniconda3/lib/python3.10/site-packages (from requests->google-search-results) (2022.12.7)\r\n", 370 | "Requirement already satisfied: 
idna<4,>=2.5 in /Users/erictak/miniconda3/lib/python3.10/site-packages (from requests->google-search-results) (3.4)\r\n", 371 | "Requirement already satisfied: charset-normalizer<3,>=2 in /Users/erictak/miniconda3/lib/python3.10/site-packages (from requests->google-search-results) (2.0.4)\r\n", 372 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/erictak/miniconda3/lib/python3.10/site-packages (from requests->google-search-results) (1.26.15)\r\n" 373 | ] 374 | } 375 | ], 376 | "source": [ 377 | "# Install serpapi\n", 378 | "#!pip install google-search-results" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 18, 384 | "metadata": { 385 | "ExecuteTime": { 386 | "end_time": "2023-05-04T07:52:52.926304Z", 387 | "start_time": "2023-05-04T07:52:52.891879Z" 388 | } 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "#import libraries\n", 393 | "from langchain.agents import load_tools\n", 394 | "from langchain.agents import initialize_agent\n", 395 | "from langchain.llms import OpenAI" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 19, 401 | "metadata": { 402 | "ExecuteTime": { 403 | "end_time": "2023-05-04T07:52:53.308175Z", 404 | "start_time": "2023-05-04T07:52:53.304615Z" 405 | } 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "# Load the model\n", 410 | "llm = OpenAI(temperature=0)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "Tools are functions that agents can use to interact with the world. These tools can be generic utilities (e.g. search), other chains, or even other agents.\n", 418 | "Here we use the SerpApi and llm-math tools. We will use the SerpApi tool to search the internet for answers to our questions. 
We will use the llm-math tool to answer math questions.\n", 419 | "\n", 420 | "[Tools documentation](https://python.langchain.com/en/latest/modules/agents/tools/getting_started.html)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 2, 426 | "metadata": { 427 | "ExecuteTime": { 428 | "end_time": "2023-05-07T12:41:37.766193Z", 429 | "start_time": "2023-05-07T12:41:37.193083Z" 430 | } 431 | }, 432 | "outputs": [ 433 | { 434 | "ename": "NameError", 435 | "evalue": "name 'load_tools' is not defined", 436 | "output_type": "error", 437 | "traceback": [ 438 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 439 | "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)", 440 | "Cell \u001B[0;32mIn[2], line 5\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;66;03m# Load in some tools to use\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \n\u001B[1;32m 3\u001B[0m \u001B[38;5;66;03m#os.environ[\"SERPAPI_API_KEY\"] = \"...\"\u001B[39;00m\n\u001B[0;32m----> 5\u001B[0m tools \u001B[38;5;241m=\u001B[39m \u001B[43mload_tools\u001B[49m([\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mserpapi\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mllm-math\u001B[39m\u001B[38;5;124m\"\u001B[39m], llm\u001B[38;5;241m=\u001B[39mllm)\n", 441 | "\u001B[0;31mNameError\u001B[0m: name 'load_tools' is not defined" 442 | ] 443 | } 444 | ], 445 | "source": [ 446 | "# Load in some tools to use\n", 447 | "\n", 448 | "#os.environ[\"SERPAPI_API_KEY\"] = \"...\"\n", 449 | "\n", 450 | "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": {}, 456 | "source": [ 457 | "### Finally, let's initialize an agent with:\n", 458 | " 1. The tools\n", 459 | " 2. The language model\n", 460 | " 3. The type of agent we want to use." 
461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 21, 466 | "metadata": { 467 | "ExecuteTime": { 468 | "end_time": "2023-05-04T07:53:11.849563Z", 469 | "start_time": "2023-05-04T07:53:11.842091Z" 470 | } 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "agent = initialize_agent(tools, llm, agent=\"zero-shot-react-description\", verbose=True)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "For the agent, I have chosen the zero-shot-react-description agent. This agent uses the ReAct framework to determine which tool to use based solely on the tool’s description. Any number of tools can be provided. This agent requires that a description is provided for each tool.\n", 482 | "\n", 483 | "[Agent types](https://python.langchain.com/en/latest/modules/agents/agents/agent_types.html)\n", 484 | "\n", 485 | "By setting verbose=True, we can see the agent's internal state as it processes the input.\n", 486 | "\n", 487 | "Other agent types could also be chosen\n", 488 | "\n", 489 | "Now, let us ask our agent a question for which it will use the Serpapi tool to search the internet for an answer and need to calculate the answer using the llm-math tool." 
490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 22, 495 | "metadata": { 496 | "ExecuteTime": { 497 | "end_time": "2023-05-04T07:53:41.299371Z", 498 | "start_time": "2023-05-04T07:53:16.761842Z" 499 | } 500 | }, 501 | "outputs": [ 502 | { 503 | "name": "stdout", 504 | "output_type": "stream", 505 | "text": [ 506 | "\n", 507 | "\n", 508 | "\u001B[1m> Entering new AgentExecutor chain...\u001B[0m\n", 509 | "\u001B[32;1m\u001B[1;3m I need to find out Obama's height and then use a calculator to figure out how many cans of coke I need.\n", 510 | "Action: Search\n", 511 | "Action Input: Obama's height\u001B[0m\n", 512 | "Observation: \u001B[36;1m\u001B[1;3m6′ 2″\u001B[0m\n", 513 | "Thought:\u001B[32;1m\u001B[1;3m I need to convert this to centimeters and then use a calculator to figure out how many cans of coke I need.\n", 514 | "Action: Calculator\n", 515 | "Action Input: 6 feet 2 inches in centimeters\u001B[0m\n", 516 | "Observation: \u001B[33;1m\u001B[1;3mAnswer: 187.96\u001B[0m\n", 517 | "Thought:\u001B[32;1m\u001B[1;3m I now know the number of centimeters and can use a calculator to figure out how many cans of coke I need.\n", 518 | "Action: Calculator\n", 519 | "Action Input: 187.96 cm divided by 12 cm (the height of a can of coke)\u001B[0m\n", 520 | "Observation: \u001B[33;1m\u001B[1;3mAnswer: 15.663333333333334\u001B[0m\n", 521 | "Thought:\u001B[32;1m\u001B[1;3m I now know the final answer\n", 522 | "Final Answer: 15.66 cans of coke can be stacked to reach Obama's height of 6 feet 2 inches.\u001B[0m\n", 523 | "\n", 524 | "\u001B[1m> Finished chain.\u001B[0m\n" 525 | ] 526 | }, 527 | { 528 | "data": { 529 | "text/plain": [ 530 | "\"15.66 cans of coke can be stacked to reach Obama's height of 6 feet 2 inches.\"" 531 | ] 532 | }, 533 | "execution_count": 22, 534 | "metadata": {}, 535 | "output_type": "execute_result" 536 | } 537 | ], 538 | "source": [ 539 | "agent.run(\"What is the hight of Obama? 
And how many cans of coke can you stack to reach that height?\")" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "# Memory: Add state to chains and agents\n", 547 | "\n", 548 | "By adding memory to our agents, we can make them more dynamic and interactive. We can have a conversation and the agent will remember what we said and use that information to answer our questions.\n" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 30, 554 | "metadata": { 555 | "ExecuteTime": { 556 | "end_time": "2023-05-04T07:55:17.264738Z", 557 | "start_time": "2023-05-04T07:55:17.258348Z" 558 | } 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "from langchain import OpenAI, ConversationChain" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 31, 568 | "metadata": { 569 | "ExecuteTime": { 570 | "end_time": "2023-05-04T07:55:17.763388Z", 571 | "start_time": "2023-05-04T07:55:17.759465Z" 572 | } 573 | }, 574 | "outputs": [], 575 | "source": [ 576 | "llm = OpenAI(temperature=0)\n", 577 | "conversation = ConversationChain(llm=llm, verbose=True)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 32, 583 | "metadata": { 584 | "ExecuteTime": { 585 | "end_time": "2023-05-04T07:55:19.754062Z", 586 | "start_time": "2023-05-04T07:55:18.363774Z" 587 | } 588 | }, 589 | "outputs": [ 590 | { 591 | "name": "stdout", 592 | "output_type": "stream", 593 | "text": [ 594 | "\n", 595 | "\n", 596 | "\u001B[1m> Entering new ConversationChain chain...\u001B[0m\n", 597 | "Prompt after formatting:\n", 598 | "\u001B[32;1m\u001B[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. 
If the AI does not know the answer to a question, it truthfully says it does not know.\n", 599 | "\n", 600 | "Current conversation:\n", 601 | "\n", 602 | "Human: Hi how are you doing!\n", 603 | "AI:\u001B[0m\n", 604 | "\n", 605 | "\u001B[1m> Finished chain.\u001B[0m\n" 606 | ] 607 | }, 608 | { 609 | "data": { 610 | "text/plain": [ 611 | "\" Hi there! I'm doing great, thanks for asking. How about you?\"" 612 | ] 613 | }, 614 | "execution_count": 32, 615 | "metadata": {}, 616 | "output_type": "execute_result" 617 | } 618 | ], 619 | "source": [ 620 | "conversation.predict(input=\"Hi how are you doing!\")" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 33, 626 | "metadata": { 627 | "ExecuteTime": { 628 | "end_time": "2023-05-04T07:55:26.267762Z", 629 | "start_time": "2023-05-04T07:55:24.635145Z" 630 | } 631 | }, 632 | "outputs": [ 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "\n", 638 | "\n", 639 | "\u001B[1m> Entering new ConversationChain chain...\u001B[0m\n", 640 | "Prompt after formatting:\n", 641 | "\u001B[32;1m\u001B[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", 642 | "\n", 643 | "Current conversation:\n", 644 | "Human: Hi how are you doing!\n", 645 | "AI: Hi there! I'm doing great, thanks for asking. How about you?\n", 646 | "Human: I'm doing well! Just having a conversation with my newly created langchain agent with memory.\n", 647 | "AI:\u001B[0m\n", 648 | "\n", 649 | "\u001B[1m> Finished chain.\u001B[0m\n" 650 | ] 651 | }, 652 | { 653 | "data": { 654 | "text/plain": [ 655 | "' That sounds really cool! 
What kind of conversations have you been having?'" 656 | ] 657 | }, 658 | "execution_count": 33, 659 | "metadata": {}, 660 | "output_type": "execute_result" 661 | } 662 | ], 663 | "source": [ 664 | "conversation.predict(input=\"I'm doing well! Just having a conversation with my newly created langchain agent with memory.\")\n" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 34, 670 | "metadata": { 671 | "ExecuteTime": { 672 | "end_time": "2023-05-04T07:55:39.039729Z", 673 | "start_time": "2023-05-04T07:55:33.774782Z" 674 | } 675 | }, 676 | "outputs": [ 677 | { 678 | "name": "stdout", 679 | "output_type": "stream", 680 | "text": [ 681 | "\n", 682 | "\n", 683 | "\u001B[1m> Entering new ConversationChain chain...\u001B[0m\n", 684 | "Prompt after formatting:\n", 685 | "\u001B[32;1m\u001B[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", 686 | "\n", 687 | "Current conversation:\n", 688 | "Human: Hi how are you doing!\n", 689 | "AI: Hi there! I'm doing great, thanks for asking. How about you?\n", 690 | "Human: I'm doing well! Just having a conversation with my newly created langchain agent with memory.\n", 691 | "AI: That sounds really cool! What kind of conversations have you been having?\n", 692 | "Human: I would like to get to know a bit about what LangChain is\n", 693 | "AI:\u001B[0m\n", 694 | "\n", 695 | "\u001B[1m> Finished chain.\u001B[0m\n" 696 | ] 697 | }, 698 | { 699 | "data": { 700 | "text/plain": [ 701 | "' Sure! LangChain is a natural language processing platform that enables developers to create AI-powered chatbots and virtual assistants. It provides a suite of tools to help developers build, train, and deploy their bots quickly and easily. 
It also offers a range of features such as natural language understanding, sentiment analysis, and text-to-speech capabilities.'" 702 | ] 703 | }, 704 | "execution_count": 34, 705 | "metadata": {}, 706 | "output_type": "execute_result" 707 | } 708 | ], 709 | "source": [ 710 | "conversation.predict(input=\"I would like to get to know a bit about what LangChain is\")" 711 | ] 712 | }, 713 | { 714 | "cell_type": "markdown", 715 | "metadata": {}, 716 | "source": [ 717 | "# Continue the conversation yourself!" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "conversation.predict(input=\"...\")" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "# This is the end of the tutorial. We hope you enjoyed it!\n", 734 | "\n", 735 | "Follow me on Github and Medium for more content:\n", 736 | "\n", 737 | "- [Github](https://github.com/rubentak)\n", 738 | "- [Medium](https://medium.com/@rubentak)" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [] 747 | } 748 | ], 749 | "metadata": { 750 | "kernelspec": { 751 | "display_name": "Python 3 (ipykernel)", 752 | "language": "python", 753 | "name": "python3" 754 | }, 755 | "language_info": { 756 | "codemirror_mode": { 757 | "name": "ipython", 758 | "version": 3 759 | }, 760 | "file_extension": ".py", 761 | "mimetype": "text/x-python", 762 | "name": "python", 763 | "nbconvert_exporter": "python", 764 | "pygments_lexer": "ipython3", 765 | "version": "3.10.10" 766 | } 767 | }, 768 | "nbformat": 4, 769 | "nbformat_minor": 1 770 | } 771 | -------------------------------------------------------------------------------- /notebooks/langchain_pinecone.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | 
"source": [ 6 | "# Pinecone + Langchain Demo\n", 7 | "\n", 8 | "In this notebook I will show you how to easily create a semantic search engine for your documents using Pinecone and Langchain. The goal is to be able to search through your documents and find the most relevant ones to your query. We will also be able to ask questions about the documents and get answers back.\n", 9 | "\n", 10 | "[Pinecone signup](https://www.pinecone.io/)\n", 11 | "After logging in, you can see your projects, indexes, and collections.\n", 12 | "\n", 13 | "\n", 14 | "[Pinecone documentation](https://docs.pinecone.io/docs/python-client)\n", 15 | "\n", 16 | "[Pinecone Langchain documentation](https://www.pinecone.io/learn/series/langchain/langchain-intro/)\n", 17 | "\n", 18 | "[Langchain documentation](https://python.langchain.com/docs/get_started/introduction.html)\n" 19 | ], 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "id": "45a8053baf528836" 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "source": [ 28 | "### Install the packages" 29 | ], 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "id": "bf32b1d972e0e1c2" 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "outputs": [], 39 | "source": [ 40 | "#!pip install langchain --upgrade\n", 41 | "#!pip install pypdf" 42 | ], 43 | "metadata": { 44 | "collapsed": false, 45 | "ExecuteTime": { 46 | "end_time": "2023-09-27T09:40:59.147531Z", 47 | "start_time": "2023-09-27T09:40:59.139746Z" 48 | } 49 | }, 50 | "id": "af8adc668081983d" 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "2d3e92ed", 56 | "metadata": { 57 | "ExecuteTime": { 58 | "end_time": "2023-09-27T09:41:54.159120Z", 59 | "start_time": "2023-09-27T09:41:54.140119Z" 60 | } 61 | }, 62 | "outputs": [ 63 | { 64 | "ename": "ModuleNotFoundError", 65 | "evalue": "No module named 'langchain'", 66 | "output_type": "error", 67 | "traceback": [ 68 | 
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 69 | "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", 70 | "Input \u001B[0;32mIn [3]\u001B[0m, in \u001B[0;36m\u001B[0;34m()\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;66;03m# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader\u001B[39;00m\n\u001B[0;32m----> 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mlangchain\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdocument_loaders\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader\n\u001B[1;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mlangchain\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mtext_splitter\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m RecursiveCharacterTextSplitter\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mos\u001B[39;00m\n", 71 | "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'langchain'" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "# PDF Loaders. 
If unstructured gives you a hard time, try PyPDFLoader\n", 77 | "from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader\n", 78 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 79 | "import os" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "5166d759", 85 | "metadata": { 86 | "ExecuteTime": { 87 | "end_time": "2023-07-11T20:38:08.852519Z", 88 | "start_time": "2023-07-11T20:38:08.848491Z" 89 | } 90 | }, 91 | "source": [ 92 | "### Load your data\n", 93 | "\n", 94 | "The PDF file that I will use is a summary of one of my courses on strategy that I took during my bachelor's degree.\n", 95 | "\n", 96 | "The PDF discusses various perspectives in the field of strategy, including the ideas of Clausewitz, Jomini, Marx, Tolstoy, Weber, Taylor, Follett, Rockefeller, Sloan, Ansoff, and game theory. It highlights the relevance of these perspectives in today's business environment and their impact on strategic thinking. The passage also mentions the different schools of thought in strategy, including the prescriptive and descriptive schools, and raises questions about their relationship to each other in the strategic process. Overall, it provides a comprehensive overview of different strategic perspectives and their implications." 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 76, 102 | "outputs": [], 103 | "source": [ 104 | "# create a loader\n", 105 | "loader = PyPDFLoader(\"../data/summary_strategy.pdf\")" 106 | ], 107 | "metadata": { 108 | "collapsed": false, 109 | "ExecuteTime": { 110 | "end_time": "2023-07-15T14:37:33.051666Z", 111 | "start_time": "2023-07-15T14:37:33.046956Z" 112 | } 113 | }, 114 | "id": "469312a1bc1a12fa" 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "source": [ 119 | "### Other options for loaders" 120 | ], 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "id": "361fd031fd4eda62" 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 77, 129 | "outputs": [], 130 | "source": [ 131 | "# loader = UnstructuredPDFLoader(\"../data/summary_strategy.pdf\")\n", 132 | "# loader = OnlinePDFLoader(\"...\")" 133 | ], 134 | "metadata": { 135 | "collapsed": false, 136 | "ExecuteTime": { 137 | "end_time": "2023-07-15T14:37:33.655630Z", 138 | "start_time": "2023-07-15T14:37:33.652399Z" 139 | } 140 | }, 141 | "id": "9dc975356d7006be" 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 78, 146 | "id": "bcdac23c", 147 | "metadata": { 148 | "ExecuteTime": { 149 | "end_time": "2023-07-15T14:37:34.295917Z", 150 | "start_time": "2023-07-15T14:37:34.040125Z" 151 | } 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "# load your data\n", 156 | "data = loader.load()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "source": [ 162 | "Note: If you're using PyPDFLoader, the text will be split by page for you already" 163 | ], 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "id": "32d7658fb742c073" 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 79, 172 | "id": "b4fd7c9e", 173 | "metadata": { 174 | "ExecuteTime": { 175 | "end_time": "2023-07-15T14:37:34.987030Z", 176 | "start_time": "2023-07-15T14:37:34.981646Z" 177 | } 178 | }, 179 | "outputs": [ 180 | { 181 | "name": 
"stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "You have 18 document(s) in your data\n", 185 | "There are 3659 characters in your document\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "print (f'You have {len(data)} document(s) in your data')\n", 191 | "print (f'There are {len(data[0].page_content)} characters in your document')" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "8af9b604", 197 | "metadata": { 198 | "ExecuteTime": { 199 | "end_time": "2023-07-11T20:38:12.194260Z", 200 | "start_time": "2023-07-11T20:38:12.191771Z" 201 | } 202 | }, 203 | "source": [ 204 | "### Split your data up into smaller documents with Chunks\n", 205 | "\n", 206 | "The chunksize should be chosen according to the length of your documents. If you have very long documents, you should choose a larger chunksize. If you have very short documents, you should choose a smaller chunksize.\n", 207 | "\n", 208 | "The chunk overlap is the number of characters that will be shared between consecutive chunks. This is useful if you want to make sure that context is not lost at the boundary between two chunks.\n", 209 | "\n", 210 | "Play around with these parameters to see what works best for your data.\n", 211 | "\n", 212 | "Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time." 
213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 80, 218 | "id": "fb3c6f02", 219 | "metadata": { 220 | "ExecuteTime": { 221 | "end_time": "2023-07-15T14:37:35.803893Z", 222 | "start_time": "2023-07-15T14:37:35.801136Z" 223 | } 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", 228 | "texts = text_splitter.split_documents(data)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 81, 234 | "id": "879873a4", 235 | "metadata": { 236 | "ExecuteTime": { 237 | "end_time": "2023-07-15T14:37:36.304731Z", 238 | "start_time": "2023-07-15T14:37:36.295954Z" 239 | } 240 | }, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "Now you have 41 documents\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "print (f'Now you have {len(texts)} documents')" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "id": "838b2843", 257 | "metadata": { 258 | "ExecuteTime": { 259 | "end_time": "2023-07-11T20:38:19.825374Z", 260 | "start_time": "2023-07-11T20:38:19.820981Z" 261 | } 262 | }, 263 | "source": [ 264 | "### Create embeddings of your documents\n", 265 | "\n", 266 | "Here we import the LangChain and Pinecone libraries. We will use the OpenAIEmbeddings class to create embeddings of our documents. We will then use the Pinecone library to create a Pinecone index and add our documents to it. Finally, we will use the Pinecone library to query our index and get back the most similar documents to our query." 
267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 121, 272 | "outputs": [], 273 | "source": [ 274 | "# import libraries\n", 275 | "from langchain.vectorstores import Pinecone\n", 276 | "from langchain.embeddings.openai import OpenAIEmbeddings\n", 277 | "import pinecone" 278 | ], 279 | "metadata": { 280 | "collapsed": false, 281 | "ExecuteTime": { 282 | "end_time": "2023-07-16T21:39:05.951908Z", 283 | "start_time": "2023-07-16T21:39:05.949818Z" 284 | } 285 | }, 286 | "id": "7e0c8c0a0cdb94" 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "source": [ 291 | "I chose to store my api keys in a file called credentials.py. You can also store them in your environment variables. You can find your api keys in the pinecone console." 292 | ], 293 | "metadata": { 294 | "collapsed": false 295 | }, 296 | "id": "3e2ed8751279bb2d" 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 122, 301 | "outputs": [], 302 | "source": [ 303 | "# import your API keys from a file called credentials.py\n", 304 | "from credentials import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV" 305 | ], 306 | "metadata": { 307 | "collapsed": false, 308 | "ExecuteTime": { 309 | "end_time": "2023-07-16T21:39:09.408778Z", 310 | "start_time": "2023-07-16T21:39:09.405108Z" 311 | } 312 | }, 313 | "id": "5fd0fc5a10960ab9" 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 123, 318 | "id": "0e093ef3", 319 | "metadata": { 320 | "hide_input": false, 321 | "ExecuteTime": { 322 | "end_time": "2023-07-16T21:39:11.479373Z", 323 | "start_time": "2023-07-16T21:39:11.475703Z" 324 | } 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "# you can also store the keys in your environment variables\n", 329 | "# OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'sk-...')\n", 330 | "#\n", 331 | "# PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '...')\n", 332 | "# PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', '...')" 333 | ] 334 | }, 335 | { 
336 | "cell_type": "markdown", 337 | "source": [ 338 | "I will use the OpenAI embeddings model to create embeddings of my documents. You can use any of the models that are available in the OpenAIEmbeddings class. You can also use any of the other embeddings models that are available in the langchain library." 339 | ], 340 | "metadata": { 341 | "collapsed": false 342 | }, 343 | "id": "42239b867b9ed4f7" 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 85, 348 | "id": "4e0d1c6a", 349 | "metadata": { 350 | "ExecuteTime": { 351 | "end_time": "2023-07-15T14:37:46.360383Z", 352 | "start_time": "2023-07-15T14:37:46.357915Z" 353 | } 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "# create embeddings\n", 358 | "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "source": [ 364 | "## Create a Pinecone index and add your documents to it\n", 365 | "\n", 366 | "Here we create a Pinecone index and add our documents to it. We will then use the Pinecone library to query our index and get back the most similar documents to our query. I chose a dimension of 1536 and a metric of cosine. 
You can play around with these parameters to see what works best for your data.\n", 367 | "\n", 368 | "For the OpenAI text-embedding-ada-002 embeddings, the output dimension is 1536, hence the dimension parameter.\n", 369 | "\n", 370 | "\n", 371 | "The metric can be cosine, euclidean (L2), or dotproduct, depending on the type of data you have.\n", 372 | "\n", 373 | "#### Cosine Distance\n", 374 | "Description: Measures the cosine of the angle between two vectors, often used when working with normalized or convex sets.\n", 375 | "Use Cases: Document classification, semantic search, recommendation systems, and any other task involving high-dimensional and normalized data.\n", 376 | "\n", 377 | "#### Euclidean Distance (L2)\n", 378 | "Description: Calculates the straight-line distance between two vectors in a multidimensional space.\n", 379 | "Use Cases: Image recognition, speech recognition, handwriting analysis.\n", 380 | "\n", 381 | "#### Inner Product (Dot Product)\n", 382 | "Description: Computes the sum of the products of the vectors' corresponding components.\n", 383 | "Use Cases: Recommendation systems, collaborative filtering, matrix factorization.\n", 384 | "\n", 385 | "[source](https://www.imaurer.com/which-vector-similarity-metric-should-i-use/)\n", 386 | "\n", 387 | "In the free trial of Pinecone you can only create one index. If you want to create more, you can upgrade to a paid plan." 
388 | ], 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "id": "fbf7f613063b114b" 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "outputs": [], 398 | "source": [ 399 | "# create a pinecone index\n", 400 | "pinecone.create_index(\"python-index\", dimension=1536, metric=\"cosine\")" 401 | ], 402 | "metadata": { 403 | "collapsed": false 404 | }, 405 | "id": "a416ab938d53caa" 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 86, 410 | "id": "0deb2f6a", 411 | "metadata": { 412 | "ExecuteTime": { 413 | "end_time": "2023-07-15T14:37:53.481093Z", 414 | "start_time": "2023-07-15T14:37:52.637558Z" 415 | } 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "# initialize pinecone\n", 420 | "pinecone.init(\n", 421 | " api_key=PINECONE_API_KEY, # find at app.pinecone.io\n", 422 | " environment=PINECONE_API_ENV # next to API key in console\n", 423 | ")\n", 424 | "\n", 425 | "index_name = \"python-index\" # put in the name of your pinecone index here" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "outputs": [], 432 | "source": [ 433 | "docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)" 434 | ], 435 | "metadata": { 436 | "collapsed": false 437 | }, 438 | "id": "1d5fe7bfef4edb7f" 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "outputs": [], 444 | "source": [ 445 | "# if you already have an index, you can load it like this\n", 446 | "#docsearch = Pinecone.from_existing_index(index_name, embeddings)" 447 | ], 448 | "metadata": { 449 | "collapsed": false 450 | }, 451 | "id": "4b390bf8ee6b513b" 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 115, 456 | "id": "34929595", 457 | "metadata": { 458 | "ExecuteTime": { 459 | "end_time": "2023-07-15T15:32:31.030766Z", 460 | "start_time": "2023-07-15T15:32:29.390088Z" 461 | } 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | 
"query = \"Who was Von Clausewitz?\"\n", 466 | "docs = docsearch.similarity_search(query)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "source": [ 472 | "the function of similarity_search is defined as follows:\n", 473 | "\n", 474 | "\n", 475 | "def similarity_search(\n", 476 | " self,\n", 477 | " query: str,\n", 478 | " k: int = 4,\n", 479 | " filter: dict | None = None,\n", 480 | " namespace: str | None = None,\n", 481 | " **kwargs: Any) -> list[Document]\n", 482 | "\n", 483 | "\n", 484 | "The Query is the text that you want to search for. The k is the number of documents that you want to return. The filter is a dictionary of filters that you can use to filter your results. The namespace is the namespace of your index. The **kwargs are any other arguments that you want to pass to the pinecone library.\n", 485 | "\n", 486 | "By default, it will return the top 4 documents that are most similar to your query. You can change this by changing the k parameter.\n" 487 | ], 488 | "metadata": { 489 | "collapsed": false 490 | }, 491 | "id": "5937579e86312da5" 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 89, 496 | "outputs": [ 497 | { 498 | "data": { 499 | "text/plain": "[Document(page_content='9RQ\\x03&ODXVHZLW] Historic perspectives can still be seen in business perspective today. Clausewitz his perspective seems timeless. He is in the same timeframe as Jomini. French was in a revolution; Eu was ruled by royal houses and the Napoleonic war was going on. When was 12 he joined the army and when he turned 21 he joined the military academy as a scholar. There he met Gerhard von Scharnhorst (lecturer) and Marie von Bruhl (married Clausewitz). The was a grave and was Clausewitz his ticket to the higher circles in Prussia. She played an important role in Clausewitz his career progress and development of perspective in strategy. 
Von Clausewitz served in an old-fashioned army, Clausewitz was captured by the French and Prussia was conquered. After the release Clausewitz joined the Russian army and fought Napoleon. They defeated Napoleon. Clausewitz tried to finish a book but died of Cholera. 7KH\\x03ERRN\\x03KDV\\x03WKH\\x03WLWOH\\x03µ¶RQ\\x03ZDU¶¶\\x11\\x03', metadata={}),\n Document(page_content='9RQ\\x03&ODXVHZLW] Historic perspectives can still be seen in business perspective today. Clausewitz his perspective seems timeless. He is in the same timeframe as Jomini. French was in a revolution; Eu was ruled by royal houses and the Napoleonic war was going on. When was 12 he joined the army and when he turned 21 he joined the military academy as a scholar. There he met Gerhard von Scharnhorst (lecturer) and Marie von Bruhl (married Clausewitz). The was a grave and was Clausewitz his ticket to the higher circles in Prussia. She played an important role in Clausewitz his career progress and development of perspective in strategy. Von Clausewitz served in an old-fashioned army, Clausewitz was captured by the French and Prussia was conquered. After the release Clausewitz joined the Russian army and fought Napoleon. They defeated Napoleon. Clausewitz tried to finish a book but died of Cholera. 7KH\\x03ERRN\\x03KDV\\x03WKH\\x03WLWOH\\x03µ¶RQ\\x03ZDU¶¶\\x11\\x03', metadata={}),\n Document(page_content='9RQ\\x03&ODXVHZLW] Historic perspectives can still be seen in business perspective today. Clausewitz his perspective seems timeless. He is in the same timeframe as Jomini. French was in a revolution; Eu was ruled by royal houses and the Napoleonic war was going on. When was 12 he joined the army and when he turned 21 he joined the military academy as a scholar. There he met Gerhard von Scharnhorst (lecturer) and Marie von Bruhl (married Clausewitz). The was a grave and was Clausewitz his ticket to the higher circles in Prussia. 
She played an important role in Clausewitz his career progress and development of perspective in strategy. Von Clausewitz served in an old-fashioned army, Clausewitz was captured by the French and Prussia was conquered. After the release Clausewitz joined the Russian army and fought Napoleon. They defeated Napoleon. Clausewitz tried to finish a book but died of Cholera. 7KH\\x03ERRN\\x03KDV\\x03WKH\\x03WLWOH\\x03µ¶RQ\\x03ZDU¶¶\\x11\\x03 &ODXVHZLW]\\x03GHILQHV\\x03VWUDWHJ\\\\\\x03DV\\x03µ¶7KH\\x03XVH\\x03RI\\x03FRPEDW\\x0f\\x03RU\\x03WKH\\x03threat of combat, for the purpose of the ZDU\\x03LQ\\x03ZKLFK\\x03PDNHV\\x03LW\\x03WDNHV\\x03SODFH¶¶\\x11\\x03)ULFWLRQ\\x03\\x0bSDJH\\x03\\x1b\\x1a\\x0c\\x03is a concept of his perspective. Clausewitz states it as that everything in war is simple, but the simplest thing ios difficult. The difficulties in war are cumulate and in in producing a kind of friction. You can only experience friction unless you experienced a war. He describes friction as countless minor incidents which can lower the general level of performance. An example: You have a plan to attack something, but then you experience friction: little things go wrong and suddenly the roads are of terrible quality, they are worse then expected, or a supply that comes in late so soldiers are out of food and ammunition. Things that cant be foresee. This calls Clausewitz friction. If this is what you believe, what is the role of theory? Clausewitz thought that Theory should never be leading. Practice will always be turned out different,', metadata={}),\n Document(page_content='9RQ\\x03&ODXVHZLW] Historic perspectives can still be seen in business perspective today. Clausewitz his perspective seems timeless. He is in the same timeframe as Jomini. French was in a revolution; Eu was ruled by royal houses and the Napoleonic war was going on. When was 12 he joined the army and when he turned 21 he joined the military academy as a scholar. 
There he met Gerhard von Scharnhorst (lecturer) and Marie von Bruhl (married Clausewitz). The was a grave and was Clausewitz his ticket to the higher circles in Prussia. She played an important role in Clausewitz his career progress and development of perspective in strategy. Von Clausewitz served in an old-fashioned army, Clausewitz was captured by the French and Prussia was conquered. After the release Clausewitz joined the Russian army and fought Napoleon. They defeated Napoleon. Clausewitz tried to finish a book but died of Cholera. 7KH\\x03ERRN\\x03KDV\\x03WKH\\x03WLWOH\\x03µ¶RQ\\x03ZDU¶¶\\x11\\x03 &ODXVHZLW]\\x03GHILQHV\\x03VWUDWHJ\\\\\\x03DV\\x03µ¶7KH\\x03XVH\\x03RI\\x03FRPEDW\\x0f\\x03RU\\x03WKH\\x03threat of combat, for the purpose of the ZDU\\x03LQ\\x03ZKLFK\\x03PDNHV\\x03LW\\x03WDNHV\\x03SODFH¶¶\\x11\\x03)ULFWLRQ\\x03\\x0bSDJH\\x03\\x1b\\x1a\\x0c\\x03is a concept of his perspective. Clausewitz states it as that everything in war is simple, but the simplest thing ios difficult. The difficulties in war are cumulate and in in producing a kind of friction. You can only experience friction unless you experienced a war. He describes friction as countless minor incidents which can lower the general level of performance. An example: You have a plan to attack something, but then you experience friction: little things go wrong and suddenly the roads are of terrible quality, they are worse then expected, or a supply that comes in late so soldiers are out of food and ammunition. Things that cant be foresee. This calls Clausewitz friction. If this is what you believe, what is the role of theory? Clausewitz thought that Theory should never be leading. 
Practice will always be turned out different,', metadata={})]" 500 | }, 501 | "execution_count": 89, 502 | "metadata": {}, 503 | "output_type": "execute_result" 504 | } 505 | ], 506 | "source": [ 507 | "docs" 508 | ], 509 | "metadata": { 510 | "collapsed": false, 511 | "ExecuteTime": { 512 | "end_time": "2023-07-15T14:38:06.258476Z", 513 | "start_time": "2023-07-15T14:38:06.252699Z" 514 | } 515 | }, 516 | "id": "a1da57485da1ea47" 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 90, 521 | "id": "4e0f5b45", 522 | "metadata": { 523 | "ExecuteTime": { 524 | "end_time": "2023-07-15T14:38:08.005124Z", 525 | "start_time": "2023-07-15T14:38:07.993020Z" 526 | } 527 | }, 528 | "outputs": [ 529 | { 530 | "name": "stdout", 531 | "output_type": "stream", 532 | "text": [ 533 | "9RQ\u0003&ODXVHZLW] Historic perspectives can still be seen in business perspective today. Clausewitz his perspective seems timeless. He is in the same timeframe as Jomini. French was in a revolution; Eu was ruled by royal houses and the Napoleonic war was going on. When was 12 he joined the army and when he turned 21 he joined the military academy as a scholar. There he met Gerhard von Scharnhorst (lecturer) and Marie von Bruhl (married Clausewitz).\n" 534 | ] 535 | } 536 | ], 537 | "source": [ 538 | "# Here's an example of the first document that was returned\n", 539 | "print(docs[0].page_content[:450])" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "id": "3c35dcd9", 545 | "metadata": { 546 | "ExecuteTime": { 547 | "end_time": "2023-07-11T20:23:44.812819Z", 548 | "start_time": "2023-07-11T20:23:44.807162Z" 549 | } 550 | }, 551 | "source": [ 552 | "### Query those docs to get your answer back\n", 553 | "\n", 554 | "Here we will use the langchain library to create a question answering chain. 
We will then use the chain to query our documents and get back the answer to our question.\n", 555 | "\n", 556 | "If you want to know more about how to use the question answering chain, I wrote a Medium article about it here: https://medium.com/@rubentak/langchain-using-different-langchain-chains-to-write-a-new-episode-for-the-office-us-7c45d869d895" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 117, 562 | "id": "f051337b", 563 | "metadata": { 564 | "ExecuteTime": { 565 | "end_time": "2023-07-15T15:49:47.786253Z", 566 | "start_time": "2023-07-15T15:49:47.782357Z" 567 | } 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "from langchain.chains.question_answering import load_qa_chain\n", 572 | "from langchain.llms import OpenAI\n", 573 | "from langchain.chat_models import ChatOpenAI" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "source": [ 579 | "Use GPT-4 model to answer questions" 580 | ], 581 | "metadata": { 582 | "collapsed": false 583 | }, 584 | "id": "2294c263e88fe5a7" 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 118, 589 | "id": "6b9b1c03", 590 | "metadata": { 591 | "ExecuteTime": { 592 | "end_time": "2023-07-15T15:49:48.484598Z", 593 | "start_time": "2023-07-15T15:49:48.481346Z" 594 | } 595 | }, 596 | "outputs": [], 597 | "source": [ 598 | "llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY, model_name='gpt-4')\n", 599 | "chain = load_qa_chain(llm, chain_type=\"stuff\")" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 119, 605 | "id": "f67ea7c2", 606 | "metadata": { 607 | "ExecuteTime": { 608 | "end_time": "2023-07-15T15:49:51.947489Z", 609 | "start_time": "2023-07-15T15:49:50.105431Z" 610 | } 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "query = \"Who was Von Clausewitz?\"\n", 615 | "docs = docsearch.similarity_search(query)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 120, 621 | "id": "3dfd2b7d", 622 | 
"metadata": { 623 | "ExecuteTime": { 624 | "end_time": "2023-07-15T15:50:03.469398Z", 625 | "start_time": "2023-07-15T15:49:53.052399Z" 626 | } 627 | }, 628 | "outputs": [ 629 | { 630 | "data": { 631 | "text/plain": "'Von Clausewitz was a military theorist who served in the Prussian and Russian armies. He joined the army at the age of 12 and later attended a military academy where he met influential figures such as Gerhard von Scharnhorst and Marie von Bruhl, the latter of whom he married. His wife played a significant role in his career progress and development of perspective in strategy. He was captured by the French during the Napoleonic war, and after his release, he joined the Russian army and fought against Napoleon. He attempted to finish a book titled \"On War\" but died of Cholera before he could complete it. His perspectives on war and strategy are still influential today.'" 632 | }, 633 | "execution_count": 120, 634 | "metadata": {}, 635 | "output_type": "execute_result" 636 | } 637 | ], 638 | "source": [ 639 | "chain.run(input_documents=docs, question=query)" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 95, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/plain": "[Document(page_content='9RQ\\x03&ODXVHZLW] Historic perspectives can still be seen in business perspective today. Clausewitz his perspective seems timeless. He is in the same timeframe as Jomini. French was in a revolution; Eu was ruled by royal houses and the Napoleonic war was going on. When was 12 he joined the army and when he turned 21 he joined the military academy as a scholar. There he met Gerhard von Scharnhorst (lecturer) and Marie von Bruhl (married Clausewitz). The was a grave and was Clausewitz his ticket to the higher circles in Prussia. She played an important role in Clausewitz his career progress and development of perspective in strategy. 
Von Clausewitz served in an old-fashioned army, Clausewitz was captured by the French and Prussia was conquered. After the release Clausewitz joined the Russian army and fought Napoleon. They defeated Napoleon. Clausewitz tried to finish a book but died of Cholera. 7KH\\x03ERRN\\x03KDV\\x03WKH\\x03WLWOH\\x03µ¶RQ\\x03ZDU¶¶\\x11\\x03', metadata={}),\n Document(page_content='9RQ\\x03&ODXVHZLW] Historic perspectives can still be seen in business perspective today. Clausewitz his perspective seems timeless. He is in the same timeframe as Jomini. French was in a revolution; Eu was ruled by royal houses and the Napoleonic war was going on. When was 12 he joined the army and when he turned 21 he joined the military academy as a scholar. There he met Gerhard von Scharnhorst (lecturer) and Marie von Bruhl (married Clausewitz). The was a grave and was Clausewitz his ticket to the higher circles in Prussia. She played an important role in Clausewitz his career progress and development of perspective in strategy. Von Clausewitz served in an old-fashioned army, Clausewitz was captured by the French and Prussia was conquered. After the release Clausewitz joined the Russian army and fought Napoleon. They defeated Napoleon. Clausewitz tried to finish a book but died of Cholera. 7KH\\x03ERRN\\x03KDV\\x03WKH\\x03WLWOH\\x03µ¶RQ\\x03ZDU¶¶\\x11\\x03', metadata={}),\n Document(page_content='9RQ\\x03&ODXVHZLW] Historic perspectives can still be seen in business perspective today. Clausewitz his perspective seems timeless. He is in the same timeframe as Jomini. French was in a revolution; Eu was ruled by royal houses and the Napoleonic war was going on. When was 12 he joined the army and when he turned 21 he joined the military academy as a scholar. There he met Gerhard von Scharnhorst (lecturer) and Marie von Bruhl (married Clausewitz). The was a grave and was Clausewitz his ticket to the higher circles in Prussia. 
She played an important role in Clausewitz his career progress and development of perspective in strategy. Von Clausewitz served in an old-fashioned army, Clausewitz was captured by the French and Prussia was conquered. After the release Clausewitz joined the Russian army and fought Napoleon. They defeated Napoleon. Clausewitz tried to finish a book but died of Cholera. 7KH\\x03ERRN\\x03KDV\\x03WKH\\x03WLWOH\\x03µ¶RQ\\x03ZDU¶¶\\x11\\x03 &ODXVHZLW]\\x03GHILQHV\\x03VWUDWHJ\\\\\\x03DV\\x03µ¶7KH\\x03XVH\\x03RI\\x03FRPEDW\\x0f\\x03RU\\x03WKH\\x03threat of combat, for the purpose of the ZDU\\x03LQ\\x03ZKLFK\\x03PDNHV\\x03LW\\x03WDNHV\\x03SODFH¶¶\\x11\\x03)ULFWLRQ\\x03\\x0bSDJH\\x03\\x1b\\x1a\\x0c\\x03is a concept of his perspective. Clausewitz states it as that everything in war is simple, but the simplest thing ios difficult. The difficulties in war are cumulate and in in producing a kind of friction. You can only experience friction unless you experienced a war. He describes friction as countless minor incidents which can lower the general level of performance. An example: You have a plan to attack something, but then you experience friction: little things go wrong and suddenly the roads are of terrible quality, they are worse then expected, or a supply that comes in late so soldiers are out of food and ammunition. Things that cant be foresee. This calls Clausewitz friction. If this is what you believe, what is the role of theory? Clausewitz thought that Theory should never be leading. Practice will always be turned out different,', metadata={}),\n Document(page_content='9RQ\\x03&ODXVHZLW] Historic perspectives can still be seen in business perspective today. Clausewitz his perspective seems timeless. He is in the same timeframe as Jomini. French was in a revolution; Eu was ruled by royal houses and the Napoleonic war was going on. When was 12 he joined the army and when he turned 21 he joined the military academy as a scholar. 
There he met Gerhard von Scharnhorst (lecturer) and Marie von Bruhl (married Clausewitz). The was a grave and was Clausewitz his ticket to the higher circles in Prussia. She played an important role in Clausewitz his career progress and development of perspective in strategy. Von Clausewitz served in an old-fashioned army, Clausewitz was captured by the French and Prussia was conquered. After the release Clausewitz joined the Russian army and fought Napoleon. They defeated Napoleon. Clausewitz tried to finish a book but died of Cholera. 7KH\\x03ERRN\\x03KDV\\x03WKH\\x03WLWOH\\x03µ¶RQ\\x03ZDU¶¶\\x11\\x03 &ODXVHZLW]\\x03GHILQHV\\x03VWUDWHJ\\\\\\x03DV\\x03µ¶7KH\\x03XVH\\x03RI\\x03FRPEDW\\x0f\\x03RU\\x03WKH\\x03threat of combat, for the purpose of the ZDU\\x03LQ\\x03ZKLFK\\x03PDNHV\\x03LW\\x03WDNHV\\x03SODFH¶¶\\x11\\x03)ULFWLRQ\\x03\\x0bSDJH\\x03\\x1b\\x1a\\x0c\\x03is a concept of his perspective. Clausewitz states it as that everything in war is simple, but the simplest thing ios difficult. The difficulties in war are cumulate and in in producing a kind of friction. You can only experience friction unless you experienced a war. He describes friction as countless minor incidents which can lower the general level of performance. An example: You have a plan to attack something, but then you experience friction: little things go wrong and suddenly the roads are of terrible quality, they are worse then expected, or a supply that comes in late so soldiers are out of food and ammunition. Things that cant be foresee. This calls Clausewitz friction. If this is what you believe, what is the role of theory? Clausewitz thought that Theory should never be leading. 
Practice will always be turned out different,', metadata={})]" 649 | }, 650 | "execution_count": 95, 651 | "metadata": {}, 652 | "output_type": "execute_result" 653 | } 654 | ], 655 | "source": [ 656 | "docs" 657 | ], 658 | "metadata": { 659 | "collapsed": false, 660 | "ExecuteTime": { 661 | "end_time": "2023-07-15T14:38:21.290333Z", 662 | "start_time": "2023-07-15T14:38:21.283490Z" 663 | } 664 | }, 665 | "id": "d54af84b9de76ac7" 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "outputs": [], 671 | "source": [ 672 | "from langchain.document_loaders import PyPDFLoader\n", 673 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 674 | "from langchain.vectorstores import Pinecone\n", 675 | "from langchain.embeddings.openai import OpenAIEmbeddings\n", 676 | "from langchain.llms import OpenAI\n", 677 | "from langchain.chains.question_answering import load_qa_chain\n", 678 | "import pinecone\n", 679 | "from credentials import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV" 680 | ], 681 | "metadata": { 682 | "collapsed": false 683 | }, 684 | "id": "3f6bf8351492f151" 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 113, 689 | "outputs": [], 690 | "source": [ 691 | "def create_qa_bot(prompt):\n", 692 | " # Load the PDF data\n", 693 | " loader = PyPDFLoader(\"../data/summary_strategy.pdf\")\n", 694 | " data = loader.load()\n", 695 | "\n", 696 | " # Split the data into smaller documents\n", 697 | " text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", 698 | " texts = text_splitter.split_documents(data)\n", 699 | "\n", 700 | " # Create embeddings\n", 701 | " embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", 702 | "\n", 703 | " # Create a Pinecone index and add the documents to it\n", 704 | " #pinecone.create_index(\"python-index\", dimension=1536, metric=\"cosine\")\n", 705 | " pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)\n", 706 | 
" index_name = \"python-index\"\n", 707 | " docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)\n", 708 | "\n", 709 | " # Perform similarity search\n", 710 | " docs = docsearch.similarity_search(prompt)\n", 711 | "\n", 712 | " # Load the question answering chain\n", 713 | " llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", 714 | " chain = load_qa_chain(llm, chain_type=\"stuff\")\n", 715 | "\n", 716 | " # Query the documents and get the answer\n", 717 | " answer = chain.run(input_documents=docs, question=prompt)\n", 718 | "\n", 719 | " return answer" 720 | ], 721 | "metadata": { 722 | "collapsed": false, 723 | "ExecuteTime": { 724 | "end_time": "2023-07-15T14:47:52.895033Z", 725 | "start_time": "2023-07-15T14:47:52.891904Z" 726 | } 727 | }, 728 | "id": "ea04193821b023bc" 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 114, 733 | "outputs": [ 734 | { 735 | "name": "stdout", 736 | "output_type": "stream", 737 | "text": [ 738 | " Von Clausewitz was a Prussian military theorist and soldier who served in the Prussian army and the Russian army during the Napoleonic Wars. 
He is best known for his book On War, which is still studied today for its insights into military strategy and tactics.\n" 739 | ] 740 | } 741 | ], 742 | "source": [ 743 | "# Usage example\n", 744 | "prompt = \"Who was Von Clausewitz?\"\n", 745 | "answer = create_qa_bot(prompt)\n", 746 | "print(answer)" 747 | ], 748 | "metadata": { 749 | "collapsed": false, 750 | "ExecuteTime": { 751 | "end_time": "2023-07-15T14:48:03.232732Z", 752 | "start_time": "2023-07-15T14:47:53.439422Z" 753 | } 754 | }, 755 | "id": "1c5ee52ea45343bf" 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "outputs": [], 761 | "source": [], 762 | "metadata": { 763 | "collapsed": false 764 | }, 765 | "id": "8e710cde61d98230" 766 | } 767 | ], 768 | "metadata": { 769 | "kernelspec": { 770 | "display_name": "Python 3 (ipykernel)", 771 | "language": "python", 772 | "name": "python3" 773 | }, 774 | "language_info": { 775 | "codemirror_mode": { 776 | "name": "ipython", 777 | "version": 3 778 | }, 779 | "file_extension": ".py", 780 | "mimetype": "text/x-python", 781 | "name": "python", 782 | "nbconvert_exporter": "python", 783 | "pygments_lexer": "ipython3", 784 | "version": "3.9.13" 785 | } 786 | }, 787 | "nbformat": 4, 788 | "nbformat_minor": 5 789 | } 790 | -------------------------------------------------------------------------------- /notebooks/langchain_text.file.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Using LangChain to extract information from a text file\n" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": { 16 | "collapsed": true, 17 | "ExecuteTime": { 18 | "start_time": "2023-05-23T15:27:54.147198Z", 19 | "end_time": "2023-05-23T15:27:54.149785Z" 20 | } 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# import libraries\n", 25 | "import os\n", 26 | 
"from langchain.llms import OpenAI" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "outputs": [], 33 | "source": [ 34 | "# store your OpenAI API key in the environment (replace the placeholder with your own key)\n", 35 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" 36 | ], 37 | "metadata": { 38 | "collapsed": false, 39 | "ExecuteTime": { 40 | "start_time": "2023-05-23T15:27:54.572890Z", 41 | "end_time": "2023-05-23T15:27:54.575318Z" 42 | } 43 | } 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "outputs": [], 49 | "source": [ 50 | "# read the OpenAI API key back from the environment\n", 51 | "openai_api_key = os.environ.get('OPENAI_API_KEY')" 52 | ], 53 | "metadata": { 54 | "collapsed": false, 55 | "ExecuteTime": { 56 | "start_time": "2023-05-23T15:27:55.834529Z", 57 | "end_time": "2023-05-23T15:27:55.837839Z" 58 | } 59 | } 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "outputs": [], 65 | "source": [ 66 | "from langchain.document_loaders import TextLoader\n", 67 | "loader = TextLoader('../data/notebook.txt')" 68 | ], 69 | "metadata": { 70 | "collapsed": false, 71 | "ExecuteTime": { 72 | "start_time": "2023-05-23T15:27:56.535470Z", 73 | "end_time": "2023-05-23T15:27:56.538124Z" 74 | } 75 | } 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 6, 80 | "outputs": [], 81 | "source": [ 82 | "# load the text\n", 83 | "with open('../data/notebook.txt') as f:\n", 84 | " notebook = f.read()" 85 | ], 86 | "metadata": { 87 | "collapsed": false, 88 | "ExecuteTime": { 89 | "start_time": "2023-05-23T15:27:57.066713Z", 90 | "end_time": "2023-05-23T15:27:57.070006Z" 91 | } 92 | } 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "outputs": [], 98 | "source": [ 99 | "# text split into chunks\n", 100 | "from langchain.text_splitter import CharacterTextSplitter\n", 101 | "text_splitter1 = CharacterTextSplitter(\n", 102 | " separator = \"]\\n\",\n", 103 | " chunk_size = 1000,\n", 104 | " chunk_overlap = 200,\n", 105 | " length_function = len,\n", 106 | 
")\n", 107 | "# text split 2\n", 108 | "text_splitter2 = CharacterTextSplitter(\n", 109 | " separator = \"]\\n\",\n", 110 | " chunk_size = 500,\n", 111 | " chunk_overlap = 100,\n", 112 | " length_function = len,\n", 113 | ")\n", 114 | "# text split 3\n", 115 | "text_splitter3= CharacterTextSplitter(\n", 116 | " separator = \"]\\n\",\n", 117 | " chunk_size = 100,\n", 118 | " chunk_overlap = 20,\n", 119 | " length_function = len,\n", 120 | ")" 121 | ], 122 | "metadata": { 123 | "collapsed": false, 124 | "ExecuteTime": { 125 | "start_time": "2023-05-23T15:27:58.221813Z", 126 | "end_time": "2023-05-23T15:27:58.223990Z" 127 | } 128 | } 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 25, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "page_content='[\\'# Getting started with Langchain\\\\n\\', \\'\\\\n\\', \\'In this tutorial, we will walk through the basics of using LangChain to create a simple AI chatbot that can answer questions using OpenAI and Serpapi.\\']\\n[\\'# import libraries\\\\n\\', \\'import os\\\\n\\', \\'from langchain.llms import OpenAI\\']\\n[\\'#create a new openai api key\\\\n\\', \\'#os.environ[\"OPENAI_API_KEY\"] = \"...\"\\']\\n[\\'# set up openai api key\\\\n\\', \"openai_api_key = os.environ.get(\\'OPENAI_API_KEY\\')\"]\\n[\\'### The temperature of a llm is a hyperparameter that controls the randomness of the output. It is a value between 0 and 1. A higher temperature will result in more random output. A lower temperature will result in more predictable output. For this tutorial, we will set the temperature to 0.9. 
You can play aroun with this yourself to see how it affects the output.\\']\\n[\\'# create a llm\\\\n\\', \\'llm = OpenAI(temperature = 0.9)\\']\\n[\\'text = \"What are 4 countries where they eat a lot of potatoes?\"\\\\n\\', \\'print(llm(text))\\'' metadata={}\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "texts = text_splitter1.create_documents([notebook])\n", 144 | "#texts2 = text_splitter2.create_documents([notebook])\n", 145 | "#texts3 = text_splitter3.create_documents([notebook])\n", 146 | "print(texts[0])" 147 | ], 148 | "metadata": { 149 | "collapsed": false, 150 | "ExecuteTime": { 151 | "start_time": "2023-05-18T23:43:12.525200Z", 152 | "end_time": "2023-05-18T23:43:12.531288Z" 153 | } 154 | } 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "source": [ 159 | "# Create an index" 160 | ], 161 | "metadata": { 162 | "collapsed": false 163 | } 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 10, 168 | "outputs": [ 169 | { 170 | "name": "stderr", 171 | "output_type": "stream", 172 | "text": [ 173 | "Using embedded DuckDB without persistence: data will be transient\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "from langchain.indexes import VectorstoreIndexCreator\n", 179 | "index = VectorstoreIndexCreator().from_loaders([loader])" 180 | ], 181 | "metadata": { 182 | "collapsed": false, 183 | "ExecuteTime": { 184 | "start_time": "2023-05-23T15:28:24.452308Z", 185 | "end_time": "2023-05-23T15:28:29.187213Z" 186 | } 187 | } 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 11, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": "' This text is about using LangChain to create a simple AI chatbot that can answer questions using OpenAI and Serpapi. 
It also covers how to add memory to agents, how to set up an OpenAI API key, and how to use PromptTemplates and LLMChains.'" 196 | }, 197 | "execution_count": 11, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "query = \"What is the text about?\"\n", 204 | "index.query(query)" 205 | ], 206 | "metadata": { 207 | "collapsed": false, 208 | "ExecuteTime": { 209 | "start_time": "2023-05-23T15:28:36.864010Z", 210 | "end_time": "2023-05-23T15:28:45.273966Z" 211 | } 212 | } 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 12, 217 | "outputs": [ 218 | { 219 | "name": "stderr", 220 | "output_type": "stream", 221 | "text": [ 222 | "Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised APIError: The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID c9825b3d7d5b5f0ea8e92ba86986c531 in your message.) {\n", 223 | " \"error\": {\n", 224 | " \"message\": \"The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID c9825b3d7d5b5f0ea8e92ba86986c531 in your message.)\",\n", 225 | " \"type\": \"server_error\",\n", 226 | " \"param\": null,\n", 227 | " \"code\": null\n", 228 | " }\n", 229 | "}\n", 230 | " 500 {'error': {'message': 'The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID c9825b3d7d5b5f0ea8e92ba86986c531 in your message.)', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Tue, 23 May 2023 13:28:52 GMT', 'Content-Type': 'application/json', 'Content-Length': '366', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-processing-ms': '155', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '3000', 'x-ratelimit-limit-tokens': '250000', 'x-ratelimit-remaining-requests': '2999', 'x-ratelimit-remaining-tokens': '249487', 'x-ratelimit-reset-requests': '20ms', 'x-ratelimit-reset-tokens': '122ms', 'x-request-id': 'c9825b3d7d5b5f0ea8e92ba86986c531', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '7cbda4fbf96503a7-MAD', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n" 231 | ] 232 | }, 233 | { 234 | "data": { 235 | "text/plain": "' This text explains how to use LangChain to create a simple AI chatbot that can answer questions using OpenAI and Serpapi. It explains how to set up an OpenAI API key and how to create a LLM with a temperature of 0.9. It also explains how to use PromptTemplates and LLMChains to generate responses to questions.'" 236 | }, 237 | "execution_count": 12, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "query = \"please give a summary of the text\"\n", 244 | "index.query(query)" 245 | ], 246 | "metadata": { 247 | "collapsed": false, 248 | "ExecuteTime": { 249 | "start_time": "2023-05-23T15:28:51.354647Z", 250 | "end_time": "2023-05-23T15:29:02.312746Z" 251 | } 252 | } 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 13, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | " \n", 263 | "1. Load the model\n", 264 | "2. Load in some tools to use\n", 265 | "3. Initialize an agent\n", 266 | "4. 
Create a new OpenAI API key\n", 267 | "5. Set up OpenAI API key\n", 268 | "6. Create a LLM with a temperature of 0.9\n", 269 | "7. Run the LLM with a text input\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "query = \"please give bullet points of all the steps in the text\"\n", 275 | "print(index.query(query))" 276 | ], 277 | "metadata": { 278 | "collapsed": false, 279 | "ExecuteTime": { 280 | "start_time": "2023-05-23T15:29:02.315006Z", 281 | "end_time": "2023-05-23T15:29:08.698087Z" 282 | } 283 | } 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 14, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": "' This notebook provides a tutorial on how to use LangChain to create a simple AI chatbot that can answer questions using OpenAI and Serpapi. It covers topics such as importing libraries, setting up an OpenAI API key, creating a language model, and initializing an agent. It also covers how to add memory to the agent to make it more dynamic and interactive. 
By the end of the tutorial, you will have a fully functioning AI chatbot that can answer questions and have conversations.'" 292 | }, 293 | "execution_count": 14, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "query = \"Write a introduction paragraph for an article about this notebook\"\n", 300 | "index.query(query)" 301 | ], 302 | "metadata": { 303 | "collapsed": false, 304 | "ExecuteTime": { 305 | "start_time": "2023-05-23T15:29:13.011314Z", 306 | "end_time": "2023-05-23T15:29:24.115892Z" 307 | } 308 | } 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 18, 313 | "outputs": [ 314 | { 315 | "name": "stdout", 316 | "output_type": "stream", 317 | "text": [ 318 | "\n", 319 | "\n", 320 | "- How to use LangChain to create a simple AI chatbot\n", 321 | "- How to use OpenAI and Serpapi to answer questions\n", 322 | "- How to set up an OpenAI API key\n", 323 | "- How to set the temperature of a language model\n", 324 | "- How to use tools, language models, and agents to interact with the world\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "query = \"what are the key takeaways in bullet points in this notebook?\"\n", 330 | "print(index.query(query))" 331 | ], 332 | "metadata": { 333 | "collapsed": false, 334 | "ExecuteTime": { 335 | "start_time": "2023-05-23T15:30:31.100022Z", 336 | "end_time": "2023-05-23T15:30:36.618327Z" 337 | } 338 | } 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 17, 343 | "outputs": [ 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | " #import libraries, from langchain.agents import load_tools, from langchain.agents import initialize_agent, from langchain.llms import OpenAI, # Load the model, llm = OpenAI(temperature=0), Tools are functions that agents can use to interact with the world. These tools can be generic utilities (e.g. 
search), other chains, or even other agents., # Load in some tools to use, #os.environ[\"SERPAPI_API_KEY\"] = \"...\", tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm), # Finally, let's initialize an agent with:, 1. The tools, 2. The language model, 3. The type of agent we want to use.\n" 349 | ] 350 | } 351 | ], 352 | "source": [ 353 | "query = \"what are the first 10 lines of code?\"\n", 354 | "print(index.query(query))" 355 | ], 356 | "metadata": { 357 | "collapsed": false, 358 | "ExecuteTime": { 359 | "start_time": "2023-05-23T15:30:16.866693Z", 360 | "end_time": "2023-05-23T15:30:28.531216Z" 361 | } 362 | } 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "source": [ 367 | "# Agent article writer" 368 | ], 369 | "metadata": { 370 | "collapsed": false 371 | } 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 21, 376 | "outputs": [], 377 | "source": [ 378 | "from langchain.document_loaders import TextLoader\n", 379 | "\n", 380 | "loader = TextLoader('../data/notebook.txt')" 381 | ], 382 | "metadata": { 383 | "collapsed": false, 384 | "ExecuteTime": { 385 | "start_time": "2023-05-23T15:32:39.819844Z", 386 | "end_time": "2023-05-23T15:32:39.823082Z" 387 | } 388 | } 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 22, 393 | "outputs": [], 394 | "source": [ 395 | "# load the text\n", 396 | "with open('../data/notebook.txt') as f:\n", 397 | " notebook = f.read()" 398 | ], 399 | "metadata": { 400 | "collapsed": false, 401 | "ExecuteTime": { 402 | "start_time": "2023-05-23T15:32:40.427401Z", 403 | "end_time": "2023-05-23T15:32:40.431420Z" 404 | } 405 | } 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 23, 410 | "outputs": [], 411 | "source": [ 412 | "from langchain import OpenAI, ConversationChain\n", 413 | "llm = OpenAI(temperature=0)\n", 414 | "conversation = ConversationChain(llm=llm, verbose=True)" 415 | ], 416 | "metadata": { 417 | "collapsed": false, 418 | "ExecuteTime": { 419 | 
"start_time": "2023-05-23T15:32:41.180047Z", 420 | "end_time": "2023-05-23T15:32:41.184875Z" 421 | } 422 | } 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 24, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "\n", 433 | "\n", 434 | "\u001B[1m> Entering new ConversationChain chain...\u001B[0m\n", 435 | "Prompt after formatting:\n", 436 | "\u001B[32;1m\u001B[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", 437 | "\n", 438 | "Current conversation:\n", 439 | "\n", 440 | "Human: can you read the notebook.txt file for me?\n", 441 | "AI:\u001B[0m\n", 442 | "\n", 443 | "\u001B[1m> Finished chain.\u001B[0m\n" 444 | ] 445 | }, 446 | { 447 | "data": { 448 | "text/plain": "' Sure, I can read the notebook.txt file for you. It contains a list of notes about a project you are working on. The notes include ideas, tasks, and resources related to the project. Do you need me to read the entire file or just a specific section?'" 449 | }, 450 | "execution_count": 24, 451 | "metadata": {}, 452 | "output_type": "execute_result" 453 | } 454 | ], 455 | "source": [ 456 | "conversation.predict(input=\"can you read the notebook.txt file for me?\")" 457 | ], 458 | "metadata": { 459 | "collapsed": false, 460 | "ExecuteTime": { 461 | "start_time": "2023-05-23T15:32:42.024890Z", 462 | "end_time": "2023-05-23T15:32:47.355078Z" 463 | } 464 | } 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 42, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "\n", 475 | "\n", 476 | "\u001B[1m> Entering new ConversationChain chain...\u001B[0m\n", 477 | "Prompt after formatting:\n", 478 | "\u001B[32;1m\u001B[1;3mThe following is a friendly conversation between a human and an AI. 
The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", 479 | "\n", 480 | "Current conversation:\n", 481 | "Human: can you read the notebook.txt file for me?\n", 482 | "AI: Sure, I can read the notebook.txt file for you. It contains a list of notes about a project you are working on. The notes include ideas, tasks, and resources you need to complete the project. Do you need me to read the entire file or just a specific section?\n", 483 | "Human: What is it about?\n", 484 | "AI:\u001B[0m\n", 485 | "\n", 486 | "\u001B[1m> Finished chain.\u001B[0m\n" 487 | ] 488 | }, 489 | { 490 | "data": { 491 | "text/plain": "' The notebook.txt file contains notes about a project you are working on. The notes include ideas, tasks, and resources you need to complete the project. It also includes any other information related to the project that you have written down.'" 492 | }, 493 | "execution_count": 42, 494 | "metadata": {}, 495 | "output_type": "execute_result" 496 | } 497 | ], 498 | "source": [ 499 | "conversation.predict(input=\"What is it about?\")" 500 | ], 501 | "metadata": { 502 | "collapsed": false, 503 | "ExecuteTime": { 504 | "start_time": "2023-05-18T23:46:31.121329Z", 505 | "end_time": "2023-05-18T23:46:36.924566Z" 506 | } 507 | } 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "outputs": [], 513 | "source": [], 514 | "metadata": { 515 | "collapsed": false 516 | } 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "source": [ 521 | "# chain for summarization\n", 522 | "[Documentation](https://python.langchain.com/en/latest/modules/chains/index_examples/summarize.html)" 523 | ], 524 | "metadata": { 525 | "collapsed": false 526 | } 527 | } 528 | ], 529 | "metadata": { 530 | "kernelspec": { 531 | "display_name": "Python 3", 532 | "language": "python", 533 | "name": "python3" 534 | }, 535 | "language_info": { 536 | 
"codemirror_mode": { 537 | "name": "ipython", 538 | "version": 2 539 | }, 540 | "file_extension": ".py", 541 | "mimetype": "text/x-python", 542 | "name": "python", 543 | "nbconvert_exporter": "python", 544 | "pygments_lexer": "ipython2", 545 | "version": "2.7.6" 546 | } 547 | }, 548 | "nbformat": 4, 549 | "nbformat_minor": 0 550 | } 551 | -------------------------------------------------------------------------------- /notebooks/llama_and_chroma.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "initial_id", 7 | "metadata": { 8 | "collapsed": true, 9 | "ExecuteTime": { 10 | "end_time": "2023-09-27T16:36:00.813823Z", 11 | "start_time": "2023-09-27T16:36:00.480824Z" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "ename": "ModuleNotFoundError", 17 | "evalue": "No module named 'dotenv'", 18 | "output_type": "error", 19 | "traceback": [ 20 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 21 | "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", 22 | "Input \u001B[0;32mIn [1]\u001B[0m, in \u001B[0;36m\u001B[0;34m()\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mdotenv\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m load_dotenv\n\u001B[1;32m 2\u001B[0m load_dotenv()\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mllama_index\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m GPTVectorStoreIndex, TrafilaturaWebReader\n", 23 | "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'dotenv'" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "\n", 29 | "from dotenv import load_dotenv\n", 30 | "load_dotenv()\n", 31 | "\n", 32 | "from llama_index import GPTVectorStoreIndex, TrafilaturaWebReader\n", 33 | "import chromadb\n", 34 | "\n", 35 | "\n", 36 | "def create_embedding_store(name):\n", 37 | " 
chroma_client = chromadb.Client()\n", 38 | " return chroma_client.create_collection(name)\n", 39 | "\n", 40 | "def query_pages(collection, urls, questions):\n", 41 | " docs = TrafilaturaWebReader().load_data(urls)\n", 42 | " index = GPTVectorStoreIndex.from_documents(docs, chroma_collection=collection)\n", 43 | " query_engine = index.as_query_engine()\n", 44 | " for question in questions:\n", 45 | " print(f\"Question: {question} \\n\")\n", 46 | " print(f\"Answer: {query_engine.query(question)}\")\n", 47 | "\n", 48 | "if __name__ == \"__main__\":\n", 49 | " url_list = [\"https://supertype.ai\", \"https://supertype.ai/about-us\"]\n", 50 | " questions = [\n", 51 | " \"Who are the members of Supertype.ai\",\n", 52 | " \"What problems are they trying to solve?\",\n", 53 | " \"What are the important values at the company?\"\n", 54 | " ]\n", 55 | "\n", 56 | " collection = create_embedding_store(\"supertype\")\n", 57 | "\n", 58 | " query_pages(\n", 59 | " collection,\n", 60 | " url_list,\n", 61 | " questions\n", 62 | " )" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "outputs": [], 69 | "source": [], 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "id": "191f7441b389b70a" 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 2 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython2", 92 | "version": "2.7.6" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 5 97 | } 98 | --------------------------------------------------------------------------------