├── .gitignore ├── README.md ├── img └── ai.png ├── requirements.txt └── web_explorer.py /.gitignore: -------------------------------------------------------------------------------- 1 | secrets.toml 2 | .venv 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Explorer 2 | 3 | This is a lightweight app using the [Web Research Retriever](https://github.com/langchain-ai/langchain/pull/8102). 4 | 5 | ## Setup 6 | You only need to supply a few things. 7 | 8 | In the `settings()` function, supply: 9 | 10 | * Search: Select the search tool you want to use (e.g., GoogleSearchAPIWrapper). 11 | * Vectorstore: Select the vectorstore and embeddings you want to use (e.g., Chroma, OpenAIEmbeddings). 12 | * Select the LLM you want to use (e.g., ChatOpenAI). 13 | 14 | To use `st.secrets` set environment variables in the `.streamlit/secrets.toml` file. 15 | 16 | Or, simply add environment variables and remove `st.secrets`: 17 | ``` 18 | import os 19 | os.environ["GOOGLE_API_KEY"] = "YOUR_API_KEY" 20 | os.environ["GOOGLE_CSE_ID"] = "YOUR_CSE_ID" 21 | os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1" 22 | os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY" 23 | 24 | ``` 25 | 26 | For `GOOGLE_API_KEY` , you could get it from [this link](https://console.cloud.google.com/apis/api/customsearch.googleapis.com/credentials). 
27 | 28 | For `GOOGLE_CSE_ID` , you could get it from [this link](https://programmablesearchengine.google.com/) 29 | 30 | ## Run 31 | ``` 32 | streamlit run web_explorer.py 33 | ``` 34 | 35 | Example output: 36 | ![example](https://github.com/langchain-ai/web-explorer/assets/122662504/f1383640-d089-492d-8757-ad743d34535f) -------------------------------------------------------------------------------- /img/ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/web-explorer/cd43e03e8b7eec8234a65314d76563e06a4ea9e6/img/ai.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.25.0 2 | langchain==0.0.244 3 | chromadb==0.4.3 4 | openai==0.27.8 5 | html2text==2020.1.16 6 | google-api-core==2.11.1 7 | google-api-python-client==2.95.0 8 | google-auth==2.22.0 9 | google-auth-httplib2==0.1.0 10 | googleapis-common-protos==1.59.1 11 | tiktoken==0.4.0 12 | faiss-cpu==1.7.4 -------------------------------------------------------------------------------- /web_explorer.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from langchain.callbacks.base import BaseCallbackHandler 3 | from langchain.chains import RetrievalQAWithSourcesChain 4 | from langchain.retrievers.web_research import WebResearchRetriever 5 | 6 | import os 7 | 8 | os.environ["GOOGLE_API_KEY"] = "YOUR_API_KEY" # Get it at https://console.cloud.google.com/apis/api/customsearch.googleapis.com/credentials 9 | os.environ["GOOGLE_CSE_ID"] = "YOUR_CSE_ID" # Get it at https://programmablesearchengine.google.com/ 10 | os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1" 11 | os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY" # Get it at https://beta.openai.com/account/api-keys 12 | 13 | st.set_page_config(page_title="Interweb 
Explorer", page_icon="🌐") 14 | 15 | def settings(): 16 | 17 | # Vectorstore 18 | import faiss 19 | from langchain.vectorstores import FAISS 20 | from langchain.embeddings.openai import OpenAIEmbeddings 21 | from langchain.docstore import InMemoryDocstore 22 | embeddings_model = OpenAIEmbeddings() 23 | embedding_size = 1536 24 | index = faiss.IndexFlatL2(embedding_size) 25 | vectorstore_public = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {}) 26 | 27 | # LLM 28 | from langchain.chat_models import ChatOpenAI 29 | llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, streaming=True) 30 | 31 | # Search 32 | from langchain.utilities import GoogleSearchAPIWrapper 33 | search = GoogleSearchAPIWrapper() 34 | 35 | # Initialize 36 | web_retriever = WebResearchRetriever.from_llm( 37 | vectorstore=vectorstore_public, 38 | llm=llm, 39 | search=search, 40 | num_search_results=3 41 | ) 42 | 43 | return web_retriever, llm 44 | 45 | class StreamHandler(BaseCallbackHandler): 46 | def __init__(self, container, initial_text=""): 47 | self.container = container 48 | self.text = initial_text 49 | 50 | def on_llm_new_token(self, token: str, **kwargs) -> None: 51 | self.text += token 52 | self.container.info(self.text) 53 | 54 | 55 | class PrintRetrievalHandler(BaseCallbackHandler): 56 | def __init__(self, container): 57 | self.container = container.expander("Context Retrieval") 58 | 59 | def on_retriever_start(self, query: str, **kwargs): 60 | self.container.write(f"**Question:** {query}") 61 | 62 | def on_retriever_end(self, documents, **kwargs): 63 | # self.container.write(documents) 64 | for idx, doc in enumerate(documents): 65 | source = doc.metadata["source"] 66 | self.container.write(f"**Results from {source}**") 67 | self.container.text(doc.page_content) 68 | 69 | 70 | st.sidebar.image("img/ai.png") 71 | st.header("`Interweb Explorer`") 72 | st.info("`I am an AI that can answer questions by exploring, reading, and summarizing web pages." 
73 | "I can be configured to use different modes: public API or private (no data sharing).`") 74 | 75 | # Make retriever and llm 76 | if 'retriever' not in st.session_state: 77 | st.session_state['retriever'], st.session_state['llm'] = settings() 78 | web_retriever = st.session_state.retriever 79 | llm = st.session_state.llm 80 | 81 | # User input 82 | question = st.text_input("`Ask a question:`") 83 | 84 | if question: 85 | 86 | # Generate answer (w/ citations) 87 | import logging 88 | logging.basicConfig() 89 | logging.getLogger("langchain.retrievers.web_research").setLevel(logging.INFO) 90 | qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm, retriever=web_retriever) 91 | 92 | # Write answer and sources 93 | retrieval_streamer_cb = PrintRetrievalHandler(st.container()) 94 | answer = st.empty() 95 | stream_handler = StreamHandler(answer, initial_text="`Answer:`\n\n") 96 | result = qa_chain({"question": question},callbacks=[retrieval_streamer_cb, stream_handler]) 97 | answer.info('`Answer:`\n\n' + result['answer']) 98 | st.info('`Sources:`\n\n' + result['sources']) 99 | --------------------------------------------------------------------------------