├── requirements.txt ├── LICENSE ├── webquery.py ├── README.md ├── main.py └── streamlitui.py /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | async-timeout==4.0.2 4 | attrs==23.1.0 5 | backports.zoneinfo==0.2.1 6 | certifi==2022.12.7 7 | charset-normalizer==3.1.0 8 | colorama==0.4.6 9 | courlan==0.9.2 10 | dataclasses-json==0.5.7 11 | dateparser==1.1.8 12 | frozenlist==1.3.3 13 | greenlet==2.0.2 14 | htmldate==1.4.3 15 | idna==3.4 16 | jusText==3.0.0 17 | langchain==0.0.161 18 | langcodes==3.3.0 19 | lxml==4.9.2 20 | marshmallow==3.19.0 21 | marshmallow-enum==1.5.1 22 | multidict==6.0.4 23 | mypy-extensions==1.0.0 24 | numexpr==2.8.4 25 | numpy==1.24.3 26 | openai==0.27.6 27 | openapi-schema-pydantic==1.2.4 28 | packaging==23.1 29 | pydantic==1.10.7 30 | python-dateutil==2.8.2 31 | pytz==2023.3 32 | pytz-deprecation-shim==0.1.0.post0 33 | PyYAML==6.0 34 | regex==2023.5.5 35 | requests==2.30.0 36 | six==1.16.0 37 | SQLAlchemy==2.0.12 38 | tenacity==8.2.2 39 | tld==0.13 40 | tqdm==4.65.0 41 | trafilatura==1.5.0 42 | typing-inspect==0.8.0 43 | typing_extensions==4.5.0 44 | tzdata==2023.3 45 | tzlocal==4.3 46 | urllib3==1.26.15 47 | yarl==1.9.2 48 | streamlit==1.23.1 49 | streamlit_chat==0.0.2.2 50 | chromadb==0.3.26 51 | tiktoken==0.4.0 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Anil Chandra Naidu Matcha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 
| furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /webquery.py: -------------------------------------------------------------------------------- 1 | import os, trafilatura 2 | from langchain.embeddings.openai import OpenAIEmbeddings 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter 4 | from langchain.vectorstores import Chroma 5 | from langchain.chains.question_answering import load_qa_chain 6 | from langchain.llms import OpenAI 7 | from langchain.docstore.document import Document 8 | 9 | class WebQuery: 10 | def __init__(self, openai_api_key = None) -> None: 11 | self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) 12 | os.environ["OPENAI_API_KEY"] = openai_api_key 13 | self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) 14 | self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key) 15 | self.chain = None 16 | self.db = None 17 | 18 | def ask(self, question: str) -> str: 19 | if self.chain is None: 20 | response = "Please, add a document." 
21 | else: 22 | docs = self.db.get_relevant_documents(question) 23 | response = self.chain.run(input_documents=docs, question=question) 24 | return response 25 | 26 | def ingest(self, url: str) -> str: 27 | result = trafilatura.extract(trafilatura.fetch_url(url)) 28 | documents = [Document(page_content=result, metadata={"source": url})] 29 | splitted_documents = self.text_splitter.split_documents(documents) 30 | self.db = Chroma.from_documents(splitted_documents, self.embeddings).as_retriever() 31 | self.chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff") 32 | return "Success" 33 | 34 | def forget(self) -> None: 35 | self.db = None 36 | self.chain = None -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Website-to-Chatbot 2 | 3 | ChatGPT for every website 4 | 5 | Instantly answer your visitors' questions with a personalized chatbot trained on your website content. 
6 | 7 | ### Tutorial -> https://www.youtube.com/watch?v=ZSfdZVvZ99Q 8 | 9 | Create app like [SiteGPT](https://www.thesamur.ai/sitegpt-alternative) , [Chatbase](https://www.thesamur.ai/chatbase-alternative) , [Dante AI](https://www.thesamur.ai/danteai-alternative) , [Botsonic](https://www.thesamur.ai/botsonic-alternatives) , [CustomGPT](https://www.thesamur.ai/customgpt-alternative) , [Botpress](https://www.thesamur.ai/botpress-alternative) , [Chatbot AI](https://www.thesamur.ai/botpress-alternative) in less than 40 lines of code 10 | 11 | https://github.com/Anil-matcha/Chatbase/assets/4326215/ef4a9654-dc72-41bb-8644-68454b3a3e33 12 | 13 | ### Getting Started 14 | 15 | Code is up, ⭐ (Star) the repo to receive updates 16 | 17 | Replit and Streamlit version coming soon 18 | 19 | Follow [Anil Chandra Naidu Matcha](https://twitter.com/matchaman11) on Twitter for updates 20 | 21 | Subscribe to https://www.youtube.com/@AnilChandraNaiduMatcha for more such video tutorials 22 | 23 | ### How to run? 24 | 25 | 1. Create a virtual environment in Python https://docs.python.org/3/library/venv.html 26 | 27 | 2. Run "pip install -r requirements.txt" 28 | 29 | 3. Set the OPENAI_API_KEY environment variable with your OpenAI key 30 | 31 | 4. Run "python main.py" 32 | 33 | 5. 
Change url and query in code if you want to try with any other content 34 | 35 | To run streamlit app, follow the steps run "streamlit run streamlitui.py" 36 | 37 | ### Demo link 38 | 39 | https://heybot.thesamur.ai/ 40 | 41 | ### Also check 42 | [Chat with PDF code](https://github.com/Anil-matcha/ChatPDF) 43 | 44 | [Chat with CSV code](https://github.com/Anil-matcha/Chat-With-Excel) 45 | 46 | [Chat with Youtube code](https://github.com/Anil-matcha/Chat-Youtube) 47 | 48 | [ChatGPT in Discord code](https://github.com/Anil-matcha/DiscordGPT) 49 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from langchain.tools import BaseTool 2 | from langchain.text_splitter import RecursiveCharacterTextSplitter 3 | from pydantic import Field 4 | from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain, BaseCombineDocumentsChain 5 | from langchain.chat_models import ChatOpenAI 6 | import os, asyncio, trafilatura 7 | from langchain.docstore.document import Document 8 | 9 | def _get_text_splitter(): 10 | return RecursiveCharacterTextSplitter( 11 | # Set a really small chunk size, just to show. 12 | chunk_size = 500, 13 | chunk_overlap = 20, 14 | length_function = len, 15 | ) 16 | 17 | class WebpageQATool(BaseTool): 18 | name = "query_webpage" 19 | description = "Browse a webpage and retrieve the information relevant to the question." 
20 | text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=_get_text_splitter) 21 | qa_chain: BaseCombineDocumentsChain 22 | 23 | def _run(self, question: str) -> str: 24 | result = trafilatura.extract(trafilatura.fetch_url(url)) 25 | docs = [Document(page_content=result, metadata={"source": url})] 26 | web_docs = self.text_splitter.split_documents(docs) 27 | results = [] 28 | for i in range(0, len(web_docs), 4): 29 | input_docs = web_docs[i:i+4] 30 | window_result = self.qa_chain({"input_documents": input_docs, "question": question}, return_only_outputs=True) 31 | results.append(f"Response from window {i} - {window_result}") 32 | results_docs = [Document(page_content="\n".join(results), metadata={"source": url})] 33 | return self.qa_chain({"input_documents": results_docs, "question": question}, return_only_outputs=True) 34 | 35 | async def _arun(self, url: str, question: str) -> str: 36 | raise NotImplementedError 37 | 38 | llm = ChatOpenAI(temperature=1.0) 39 | query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm)) 40 | url = "https://uuki.live/" 41 | print(query_website_tool.run("What is UUKI ?")) -------------------------------------------------------------------------------- /streamlitui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import streamlit as st 4 | from streamlit_chat import message 5 | from webquery import WebQuery 6 | 7 | st.set_page_config(page_title="Website to Chatbot") 8 | 9 | 10 | def display_messages(): 11 | st.subheader("Chat") 12 | for i, (msg, is_user) in enumerate(st.session_state["messages"]): 13 | message(msg, is_user=is_user, key=str(i)) 14 | st.session_state["thinking_spinner"] = st.empty() 15 | 16 | 17 | def process_input(): 18 | if st.session_state["user_input"] and len(st.session_state["user_input"].strip()) > 0: 19 | user_text = st.session_state["user_input"].strip() 20 | with st.session_state["thinking_spinner"], 
st.spinner(f"Thinking"): 21 | query_text = st.session_state["webquery"].ask(user_text) 22 | 23 | st.session_state["messages"].append((user_text, True)) 24 | st.session_state["messages"].append((query_text, False)) 25 | 26 | def ingest_input(): 27 | if st.session_state["input_url"] and len(st.session_state["input_url"].strip()) > 0: 28 | url = st.session_state["input_url"].strip() 29 | with st.session_state["thinking_spinner"], st.spinner(f"Thinking"): 30 | ingest_text = st.session_state["webquery"].ingest(url) 31 | 32 | def is_openai_api_key_set() -> bool: 33 | return len(st.session_state["OPENAI_API_KEY"]) > 0 34 | 35 | 36 | def main(): 37 | if len(st.session_state) == 0: 38 | st.session_state["messages"] = [] 39 | st.session_state["url"] = "" 40 | st.session_state["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "") 41 | if is_openai_api_key_set(): 42 | st.session_state["webquery"] = WebQuery(st.session_state["OPENAI_API_KEY"]) 43 | else: 44 | st.session_state["webquery"] = None 45 | 46 | st.header("Website to Chatbot") 47 | 48 | if st.text_input("OpenAI API Key", value=st.session_state["OPENAI_API_KEY"], key="input_OPENAI_API_KEY", type="password"): 49 | if ( 50 | len(st.session_state["input_OPENAI_API_KEY"]) > 0 51 | and st.session_state["input_OPENAI_API_KEY"] != st.session_state["OPENAI_API_KEY"] 52 | ): 53 | st.session_state["OPENAI_API_KEY"] = st.session_state["input_OPENAI_API_KEY"] 54 | st.session_state["messages"] = [] 55 | st.session_state["user_input"] = "" 56 | st.session_state["input_url"] = "" 57 | st.session_state["webquery"] = WebQuery(st.session_state["OPENAI_API_KEY"]) 58 | 59 | st.subheader("Add a url") 60 | st.text_input("Input url", value=st.session_state["url"], key="input_url", disabled=not is_openai_api_key_set(), on_change=ingest_input) 61 | 62 | st.session_state["ingestion_spinner"] = st.empty() 63 | 64 | display_messages() 65 | st.text_input("Message", key="user_input", disabled=not is_openai_api_key_set(), on_change=process_input) 
66 | 67 | st.divider() 68 | st.markdown("Source code: [Github](https://github.com/Anil-matcha/Website-to-Chatbot)") 69 | 70 | 71 | if __name__ == "__main__": 72 | main() --------------------------------------------------------------------------------