├── .gitignore ├── README.md ├── img └── ai.png ├── requirements.txt └── web_explorer.py /.gitignore: -------------------------------------------------------------------------------- 1 | secrets.toml 2 | .venv 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Explorer 2 | 3 | This is a lightweight app using the [Web Research Retriever](https://github.com/langchain-ai/langchain/pull/8102). 4 | 5 | ## Setup 6 | You only need to supply a few things. 7 | 8 | In the `settings()` function, supply: 9 | 10 | * Search: Select the search tool you want to use (e.g., GoogleSearchAPIWrapper). 11 | * Vectorstore: Select the vectorstore and embeddings you want to use (e.g., Chroma, OpenAIEmbeddings). 12 | * Select the LLM you want to use (e.g., ChatOpenAI). 13 | 14 | To use `st.secrets` set environment variables in the `.streamlit/secrets.toml` file. 15 | 16 | Or, simply add environment variables and remove `st.secrets`: 17 | ``` 18 | import os 19 | os.environ["GOOGLE_API_KEY"] = "YOUR_API_KEY" 20 | os.environ["GOOGLE_CSE_ID"] = "YOUR_CSE_ID" 21 | os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1" 22 | os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY" 23 | 24 | ``` 25 | 26 | For `GOOGLE_API_KEY` , you could get it from [this link](https://console.cloud.google.com/apis/api/customsearch.googleapis.com/credentials). 
27 | 28 | For `GOOGLE_CSE_ID` , you could get it from [this link](https://programmablesearchengine.google.com/) 29 | 30 | ## Run 31 | ``` 32 | streamlit run web_explorer.py 33 | ``` 34 | 35 | Example output: 36 | ![example](https://github.com/langchain-ai/web-explorer/assets/122662504/f1383640-d089-492d-8757-ad743d34535f) -------------------------------------------------------------------------------- /img/ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/web-explorer/cd43e03e8b7eec8234a65314d76563e06a4ea9e6/img/ai.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.25.0 2 | langchain==0.0.244 3 | chromadb==0.4.3 4 | openai==0.27.8 5 | html2text==2020.1.16 6 | google-api-core==2.11.1 7 | google-api-python-client==2.95.0 8 | google-auth==2.22.0 9 | google-auth-httplib2==0.1.0 10 | googleapis-common-protos==1.59.1 11 | tiktoken==0.4.0 12 | faiss-cpu==1.7.4 -------------------------------------------------------------------------------- /web_explorer.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from langchain.callbacks.base import BaseCallbackHandler 3 | from langchain.chains import RetrievalQAWithSourcesChain 4 | from langchain.retrievers.web_research import WebResearchRetriever 5 | 6 | import os 7 | 8 | os.environ["GOOGLE_API_KEY"] = "YOUR_API_KEY" # Get it at https://console.cloud.google.com/apis/api/customsearch.googleapis.com/credentials 9 | os.environ["GOOGLE_CSE_ID"] = "YOUR_CSE_ID" # Get it at https://programmablesearchengine.google.com/ 10 | os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1" 11 | os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY" # Get it at https://beta.openai.com/account/api-keys 12 | 13 | st.set_page_config(page_title="Interweb 
Explorer", page_icon="🌐") 14 | 15 | def settings(): 16 | 17 | # Vectorstore 18 | import faiss 19 | from langchain.vectorstores import FAISS 20 | from langchain.embeddings.openai import OpenAIEmbeddings 21 | from langchain.docstore import InMemoryDocstore 22 | embeddings_model = OpenAIEmbeddings() 23 | embedding_size = 1536 24 | index = faiss.IndexFlatL2(embedding_size) 25 | vectorstore_public = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {}) 26 | 27 | # LLM 28 | from langchain.chat_models import ChatOpenAI 29 | llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, streaming=True) 30 | 31 | # Search 32 | from langchain.utilities import GoogleSearchAPIWrapper 33 | search = GoogleSearchAPIWrapper() 34 | 35 | # Initialize 36 | web_retriever = WebResearchRetriever.from_llm( 37 | vectorstore=vectorstore_public, 38 | llm=llm, 39 | search=search, 40 | num_search_results=3 41 | ) 42 | 43 | return web_retriever, llm 44 | 45 | class StreamHandler(BaseCallbackHandler): 46 | def __init__(self, container, initial_text=""): 47 | self.container = container 48 | self.text = initial_text 49 | 50 | def on_llm_new_token(self, token: str, **kwargs) -> None: 51 | self.text += token 52 | self.container.info(self.text) 53 | 54 | 55 | class PrintRetrievalHandler(BaseCallbackHandler): 56 | def __init__(self, container): 57 | self.container = container.expander("Context Retrieval") 58 | 59 | def on_retriever_start(self, query: str, **kwargs): 60 | self.container.write(f"**Question:** {query}") 61 | 62 | def on_retriever_end(self, documents, **kwargs): 63 | # self.container.write(documents) 64 | for idx, doc in enumerate(documents): 65 | source = doc.metadata["source"] 66 | self.container.write(f"**Results from {source}**") 67 | self.container.text(doc.page_content) 68 | 69 | 70 | st.sidebar.image("img/ai.png") 71 | st.header("`Interweb Explorer`") 72 | st.info("`I am an AI that can answer questions by exploring, reading, and summarizing web pages." 
73 | "I can be configured to use different modes: public API or private (no data sharing).`") 74 | 75 | # Make retriever and llm 76 | if 'retriever' not in st.session_state: 77 | st.session_state['retriever'], st.session_state['llm'] = settings() 78 | web_retriever = st.session_state.retriever 79 | llm = st.session_state.llm 80 | 81 | # User input 82 | question = st.text_input("`Ask a question:`") 83 | 84 | if question: 85 | 86 | # Generate answer (w/ citations) 87 | import logging 88 | logging.basicConfig() 89 | logging.getLogger("langchain.retrievers.web_research").setLevel(logging.INFO) 90 | qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm, retriever=web_retriever) 91 | 92 | # Write answer and sources 93 | retrieval_streamer_cb = PrintRetrievalHandler(st.container()) 94 | answer = st.empty() 95 | stream_handler = StreamHandler(answer, initial_text="`Answer:`\n\n") 96 | result = qa_chain({"question": question},callbacks=[retrieval_streamer_cb, stream_handler]) 97 | answer.info('`Answer:`\n\n' + result['answer']) 98 | st.info('`Sources:`\n\n' + result['sources']) 99 | --------------------------------------------------------------------------------