├── requirements.txt ├── LICENSE ├── webquery.py ├── README.md ├── main.py └── streamlitui.py /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | async-timeout==4.0.2 4 | attrs==23.1.0 5 | backports.zoneinfo==0.2.1 6 | certifi==2022.12.7 7 | charset-normalizer==3.1.0 8 | colorama==0.4.6 9 | courlan==0.9.2 10 | dataclasses-json==0.5.7 11 | dateparser==1.1.8 12 | frozenlist==1.3.3 13 | greenlet==2.0.2 14 | htmldate==1.4.3 15 | idna==3.4 16 | jusText==3.0.0 17 | langchain==0.0.161 18 | langcodes==3.3.0 19 | lxml==4.9.2 20 | marshmallow==3.19.0 21 | marshmallow-enum==1.5.1 22 | multidict==6.0.4 23 | mypy-extensions==1.0.0 24 | numexpr==2.8.4 25 | numpy==1.24.3 26 | openai==0.27.6 27 | openapi-schema-pydantic==1.2.4 28 | packaging==23.1 29 | pydantic==1.10.7 30 | python-dateutil==2.8.2 31 | pytz==2023.3 32 | pytz-deprecation-shim==0.1.0.post0 33 | PyYAML==6.0 34 | regex==2023.5.5 35 | requests==2.30.0 36 | six==1.16.0 37 | SQLAlchemy==2.0.12 38 | tenacity==8.2.2 39 | tld==0.13 40 | tqdm==4.65.0 41 | trafilatura==1.5.0 42 | typing-inspect==0.8.0 43 | typing_extensions==4.5.0 44 | tzdata==2023.3 45 | tzlocal==4.3 46 | urllib3==1.26.15 47 | yarl==1.9.2 48 | streamlit==1.23.1 49 | streamlit_chat==0.0.2.2 50 | chromadb==0.3.26 51 | tiktoken==0.4.0 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Anil Chandra Naidu Matcha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 
| furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /webquery.py: -------------------------------------------------------------------------------- 1 | import os, trafilatura 2 | from langchain.embeddings.openai import OpenAIEmbeddings 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter 4 | from langchain.vectorstores import Chroma 5 | from langchain.chains.question_answering import load_qa_chain 6 | from langchain.llms import OpenAI 7 | from langchain.docstore.document import Document 8 | 9 | class WebQuery: 10 | def __init__(self, openai_api_key = None) -> None: 11 | self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) 12 | os.environ["OPENAI_API_KEY"] = openai_api_key 13 | self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) 14 | self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key) 15 | self.chain = None 16 | self.db = None 17 | 18 | def ask(self, question: str) -> str: 19 | if self.chain is None: 20 | response = "Please, add a document." 
21 | else: 22 | docs = self.db.get_relevant_documents(question) 23 | response = self.chain.run(input_documents=docs, question=question) 24 | return response 25 | 26 | def ingest(self, url: str) -> str: 27 | result = trafilatura.extract(trafilatura.fetch_url(url)) 28 | documents = [Document(page_content=result, metadata={"source": url})] 29 | splitted_documents = self.text_splitter.split_documents(documents) 30 | self.db = Chroma.from_documents(splitted_documents, self.embeddings).as_retriever() 31 | self.chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff") 32 | return "Success" 33 | 34 | def forget(self) -> None: 35 | self.db = None 36 | self.chain = None -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Website-to-Chatbot 2 | 3 | ChatGPT for every website 4 | 5 | Instantly answer your visitors' questions with a personalized chatbot trained on your website content. 
6 | 7 | ### Tutorial -> https://www.youtube.com/watch?v=ZSfdZVvZ99Q 8 | 9 | Create app like [SiteGPT](https://www.thesamur.ai/sitegpt-alternative) , [Chatbase](https://www.thesamur.ai/chatbase-alternative) , [Dante AI](https://www.thesamur.ai/danteai-alternative) , [Botsonic](https://www.thesamur.ai/botsonic-alternatives) , [CustomGPT](https://www.thesamur.ai/customgpt-alternative) , [Botpress](https://www.thesamur.ai/botpress-alternative) , [Chatbot AI](https://www.thesamur.ai/botpress-alternative) in less than 40 lines of code 10 | 11 | https://github.com/Anil-matcha/Chatbase/assets/4326215/ef4a9654-dc72-41bb-8644-68454b3a3e33 12 | 13 | ### Getting Started 14 | 15 | Code is up, ⭐ (Star) the repo to receive updates 16 | 17 | Replit and Streamlit version coming soon 18 | 19 | Follow [Anil Chandra Naidu Matcha](https://twitter.com/matchaman11) on Twitter for updates 20 | 21 | Subscribe to https://www.youtube.com/@AnilChandraNaiduMatcha for more such video tutorials 22 | 23 | ### How to run? 24 | 25 | 1. Create a virtual environment in Python https://docs.python.org/3/library/venv.html 26 | 27 | 2. Run "pip install -r requirements.txt" 28 | 29 | 3. Set the OPENAI_API_KEY environment variable with your OpenAI key 30 | 31 | 4. Run "python main.py" 32 | 33 | 5. 
Change url and query in code if you want to try with any other content 34 | 35 | To run streamlit app, follow the steps run "streamlit run streamlitui.py" 36 | 37 | ### Demo link 38 | 39 | https://heybot.thesamur.ai/ 40 | 41 | ### Also check 42 | [Chat with PDF code](https://github.com/Anil-matcha/ChatPDF) 43 | 44 | [Chat with CSV code](https://github.com/Anil-matcha/Chat-With-Excel) 45 | 46 | [Chat with Youtube code](https://github.com/Anil-matcha/Chat-Youtube) 47 | 48 | [ChatGPT in Discord code](https://github.com/Anil-matcha/DiscordGPT) 49 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from langchain.tools import BaseTool 2 | from langchain.text_splitter import RecursiveCharacterTextSplitter 3 | from pydantic import Field 4 | from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain, BaseCombineDocumentsChain 5 | from langchain.chat_models import ChatOpenAI 6 | import os, asyncio, trafilatura 7 | from langchain.docstore.document import Document 8 | 9 | def _get_text_splitter(): 10 | return RecursiveCharacterTextSplitter( 11 | # Set a really small chunk size, just to show. 12 | chunk_size = 500, 13 | chunk_overlap = 20, 14 | length_function = len, 15 | ) 16 | 17 | class WebpageQATool(BaseTool): 18 | name = "query_webpage" 19 | description = "Browse a webpage and retrieve the information relevant to the question." 
20 | text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=_get_text_splitter) 21 | qa_chain: BaseCombineDocumentsChain 22 | 23 | def _run(self, question: str) -> str: 24 | result = trafilatura.extract(trafilatura.fetch_url(url)) 25 | docs = [Document(page_content=result, metadata={"source": url})] 26 | web_docs = self.text_splitter.split_documents(docs) 27 | results = [] 28 | for i in range(0, len(web_docs), 4): 29 | input_docs = web_docs[i:i+4] 30 | window_result = self.qa_chain({"input_documents": input_docs, "question": question}, return_only_outputs=True) 31 | results.append(f"Response from window {i} - {window_result}") 32 | results_docs = [Document(page_content="\n".join(results), metadata={"source": url})] 33 | return self.qa_chain({"input_documents": results_docs, "question": question}, return_only_outputs=True) 34 | 35 | async def _arun(self, url: str, question: str) -> str: 36 | raise NotImplementedError 37 | 38 | llm = ChatOpenAI(temperature=1.0) 39 | query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm)) 40 | url = "https://uuki.live/" 41 | print(query_website_tool.run("What is UUKI ?")) -------------------------------------------------------------------------------- /streamlitui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import streamlit as st 4 | from streamlit_chat import message 5 | from webquery import WebQuery 6 | 7 | st.set_page_config(page_title="Website to Chatbot") 8 | 9 | 10 | def display_messages(): 11 | st.subheader("Chat") 12 | for i, (msg, is_user) in enumerate(st.session_state["messages"]): 13 | message(msg, is_user=is_user, key=str(i)) 14 | st.session_state["thinking_spinner"] = st.empty() 15 | 16 | 17 | def process_input(): 18 | if st.session_state["user_input"] and len(st.session_state["user_input"].strip()) > 0: 19 | user_text = st.session_state["user_input"].strip() 20 | with st.session_state["thinking_spinner"], 
st.spinner(f"Thinking"): 21 | query_text = st.session_state["webquery"].ask(user_text) 22 | 23 | st.session_state["messages"].append((user_text, True)) 24 | st.session_state["messages"].append((query_text, False)) 25 | 26 | def ingest_input(): 27 | if st.session_state["input_url"] and len(st.session_state["input_url"].strip()) > 0: 28 | url = st.session_state["input_url"].strip() 29 | with st.session_state["thinking_spinner"], st.spinner(f"Thinking"): 30 | ingest_text = st.session_state["webquery"].ingest(url) 31 | 32 | def is_openai_api_key_set() -> bool: 33 | return len(st.session_state["OPENAI_API_KEY"]) > 0 34 | 35 | 36 | def main(): 37 | if len(st.session_state) == 0: 38 | st.session_state["messages"] = [] 39 | st.session_state["url"] = "" 40 | st.session_state["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "") 41 | if is_openai_api_key_set(): 42 | st.session_state["webquery"] = WebQuery(st.session_state["OPENAI_API_KEY"]) 43 | else: 44 | st.session_state["webquery"] = None 45 | 46 | st.header("Website to Chatbot") 47 | 48 | if st.text_input("OpenAI API Key", value=st.session_state["OPENAI_API_KEY"], key="input_OPENAI_API_KEY", type="password"): 49 | if ( 50 | len(st.session_state["input_OPENAI_API_KEY"]) > 0 51 | and st.session_state["input_OPENAI_API_KEY"] != st.session_state["OPENAI_API_KEY"] 52 | ): 53 | st.session_state["OPENAI_API_KEY"] = st.session_state["input_OPENAI_API_KEY"] 54 | st.session_state["messages"] = [] 55 | st.session_state["user_input"] = "" 56 | st.session_state["input_url"] = "" 57 | st.session_state["webquery"] = WebQuery(st.session_state["OPENAI_API_KEY"]) 58 | 59 | st.subheader("Add a url") 60 | st.text_input("Input url", value=st.session_state["url"], key="input_url", disabled=not is_openai_api_key_set(), on_change=ingest_input) 61 | 62 | st.session_state["ingestion_spinner"] = st.empty() 63 | 64 | display_messages() 65 | st.text_input("Message", key="user_input", disabled=not is_openai_api_key_set(), on_change=process_input) 
66 | 67 | st.divider() 68 | st.markdown("Source code: [Github](https://github.com/Anil-matcha/Website-to-Chatbot)") 69 | 70 | 71 | if __name__ == "__main__": 72 | main() --------------------------------------------------------------------------------