├── .gitignore
├── LICENSE
├── src
│   ├── README.md
│   └── app.py
├── requirements.txt
└── README.md
/.gitignore:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Aaron Jimenez

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/src/README.md:
--------------------------------------------------------------------------------
You can change and customize the code to test different models and configurations.

### Embedding models

For this project, I tried three different embedding models to compare their performance:

#### HuggingFaceHubEmbeddings()

Note that `HuggingFaceHubEmbeddings` calls the Hugging Face Inference API, so it needs a `HUGGINGFACEHUB_API_TOKEN` set in the environment.

```python
from langchain_community.embeddings import HuggingFaceHubEmbeddings

...
vector_store = Chroma.from_documents(document_chunks, HuggingFaceHubEmbeddings())
```

#### phi3 with ollama

```python
from langchain_community.embeddings import OllamaEmbeddings

...
embeddings = OllamaEmbeddings(model='phi3')
vector_store = Chroma.from_documents(document_chunks, embeddings)
```

#### nomic-embed-text with ollama

```bash
ollama pull nomic-embed-text
```

```python
from langchain_community.embeddings import OllamaEmbeddings

...
embeddings = OllamaEmbeddings(model='nomic-embed-text')
vector_store = Chroma.from_documents(document_chunks, embeddings)
```
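One simple way to compare the models is to inspect which chunks each store actually retrieves for a few test questions. Here is a minimal sketch of such a check; the `preview_retrieval` helper, the example URL, and the sample query are illustrative, not part of the app:

```python
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

def preview_retrieval(document_chunks, embeddings, query, k=2):
    # Build a throwaway store with the chosen embedding model and print
    # the chunks it would hand to the LLM for this query.
    vector_store = Chroma.from_documents(document_chunks, embeddings)
    for doc in vector_store.similarity_search(query, k=k):
        print(doc.page_content[:200], "\n---")

docs = WebBaseLoader("https://example.com").load()
chunks = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0).split_documents(docs)
preview_retrieval(chunks, OllamaEmbeddings(model='nomic-embed-text'), "What is this page about?")
```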
### LLM

For generation I used the `phi-3` model with Ollama:

```bash
ollama run phi3
```
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.9.3
aiosignal==1.3.1
altair==5.2.0
annotated-types==0.6.0
anyio==4.3.0
asgiref==3.7.2
async-timeout==4.0.3
attrs==23.2.0
backoff==2.2.1
bcrypt==4.1.2
beautifulsoup4==4.12.3
blinker==1.7.0
build==1.0.3
cachetools==5.3.3
certifi==2024.2.2
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.4.24
click==8.1.7
coloredlogs==15.0.1
dataclasses-json==0.6.4
Deprecated==1.2.14
exceptiongroup==1.2.0
fastapi==0.110.0
filelock==3.13.1
flatbuffers==23.5.26
frozenlist==1.4.1
fsspec==2024.2.0
gitdb==4.0.11
GitPython==3.1.42
google-auth==2.28.1
googleapis-common-protos==1.62.0
greenlet==3.0.3
grpcio==1.62.0
h11==0.14.0
httptools==0.6.1
huggingface-hub==0.21.3
humanfriendly==10.0
idna==3.6
importlib-metadata==6.11.0
importlib_resources==6.1.2
Jinja2==3.1.3
jsonpatch==1.33
jsonpointer==2.4
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
kubernetes==29.0.0
langchain==0.1.9
langchain-community==0.0.24
langchain-core==0.1.27
langsmith==0.1.10
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.21.0
mdurl==0.1.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.26.4
oauthlib==3.2.2
onnxruntime==1.17.1
opentelemetry-api==1.23.0
opentelemetry-exporter-otlp-proto-common==1.23.0
opentelemetry-exporter-otlp-proto-grpc==1.23.0
opentelemetry-instrumentation==0.44b0
opentelemetry-instrumentation-asgi==0.44b0
opentelemetry-instrumentation-fastapi==0.44b0
opentelemetry-proto==1.23.0
opentelemetry-sdk==1.23.0
opentelemetry-semantic-conventions==0.44b0
opentelemetry-util-http==0.44b0
orjson==3.9.15
overrides==7.7.0
packaging==23.2
pandas==2.2.1
pillow==10.2.0
posthog==3.4.2
protobuf==4.25.3
pulsar-client==3.4.0
pyarrow==15.0.0
pyasn1==0.5.1
pyasn1-modules==0.3.0
pydantic==2.6.3
pydantic_core==2.16.3
pydeck==0.8.1b0
Pygments==2.17.2
PyPika==0.48.9
pyproject_hooks==1.0.0
python-dateutil==2.8.2
python-dotenv==1.0.1
pytz==2024.1
PyYAML==6.0.1
referencing==0.33.0
requests==2.31.0
requests-oauthlib==1.3.1
rich==13.7.1
rpds-py==0.18.0
rsa==4.9
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
soupsieve==2.5
SQLAlchemy==2.0.27
starlette==0.36.3
streamlit==1.31.1
sympy==1.12
tenacity==8.2.3
tokenizers==0.15.2
toml==0.10.2
tomli==2.0.1
toolz==0.12.1
tornado==6.4
tqdm==4.66.2
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.10.0
tzdata==2024.1
tzlocal==5.2
urllib3==2.2.1
uvicorn==0.27.1
uvloop==0.19.0
validators==0.22.0
watchdog==4.0.0
watchfiles==0.21.0
websocket-client==1.7.0
websockets==12.0
wrapt==1.16.0
yarl==1.9.4
zipp==3.17.0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Open Source Web Chatbot using RAG
---
This project implements a web-based chatbot built with the `LangChain` framework, using the `phi-3` model served through `ollama` as the LLM, `chromadb` as the vector database, and `streamlit` as the frontend. The chatbot is designed to interact with users based on the content of a specified website.

## Features

- **Web-based Interface**: Users interact with the chatbot through a web interface.
- **Document Loading**: The chatbot loads content from a specified website to establish the context of the conversation.
- **Text Splitting**: The website content is split into chunks for processing.
- **Vector Store Creation**: The text chunks are embedded as vectors and stored in a vector store for efficient retrieval.
- **RAG (Retrieval-Augmented Generation)**: The chatbot uses RAG to improve the quality of its responses. Here RAG involves two main components: a retriever chain and a conversational RAG chain.

## Setup

To run the project, follow these steps:

1. Install [ollama](https://ollama.com) on your machine and pull the models the app uses (see the snippet after these steps).

2. Install the required dependencies of the project:

   ```bash
   pip install streamlit langchain beautifulsoup4 chromadb huggingface_hub
   ```

3. Run the Streamlit app:

   ```bash
   streamlit run src/app.py
   ```

4. Once the Streamlit app is running, enter a website URL in the sidebar and start chatting with the chatbot.
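The app as written expects the `phi3` chat model and the `nomic-embed-text` embedding model used in `src/app.py` to be available locally. Assuming a default ollama installation, pulling them looks like this:

```bash
ollama pull phi3
ollama pull nomic-embed-text
```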
## What is RAG?

> *Retrieval-Augmented Generation (RAG) is the process of optimizing the output of a large language model, so it references an authoritative knowledge base outside of its training data sources before generating a response. Large Language Models (LLMs) are trained on vast volumes of data and use billions of parameters to generate original output for tasks like answering questions, translating languages, and completing sentences. RAG extends the already powerful capabilities of LLMs to specific domains or an organization's internal knowledge base, all without the need to retrain the model. It is a cost-effective approach to improving LLM output so it remains relevant, accurate, and useful in various contexts.* [Read more here](https://aws.amazon.com/what-is/retrieval-augmented-generation/)

### RAG Architecture

Following the [LangChain documentation](https://python.langchain.com/docs/use_cases/question_answering/), a typical RAG application has two main components:

Indexing:
![Indexing](https://python.langchain.com/assets/images/rag_indexing-8160f90a90a33253d0154659cf7d453f.png)

Retrieval and generation:
![Retrieval and generation](https://python.langchain.com/assets/images/rag_retrieval_generation-1046a4668d6bb08786ef73c56d4f228a.png)

## References

* [Tutorial | Chat with any Website using Python and Langchain (LATEST VERSION)](https://www.youtube.com/watch?v=bupx08ZgSFg)

* [Documentation: ollama llm in LangChain](https://python.langchain.com/docs/integrations/llms/ollama)

* [Documentation: ollama embeddings class in LangChain](https://python.langchain.com/docs/integrations/text_embedding/ollama)

* [Documentation: Hugging Face Embedding class in LangChain](https://python.langchain.com/docs/integrations/text_embedding/huggingfacehub)

* [Documentation: Q&A with RAG](https://python.langchain.com/docs/use_cases/question_answering/)

* [Nomic's New Embedding Model | nomic-embed-text](https://www.youtube.com/watch?v=LpcaeQZDVB8)

* [Introducing Nomic Embed: A Truly Open Embedding Model](https://blog.nomic.ai/posts/nomic-embed-text-v1)
--------------------------------------------------------------------------------
/src/app.py:
--------------------------------------------------------------------------------
import streamlit as st
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_community.llms import Ollama
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.embeddings import OllamaEmbeddings


def get_vector_store_from_url(url):
    # Load the HTML text from the page, split it into chunks,
    # and store the chunks in a vector store.
    loader = WebBaseLoader(url)
    document = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)  # TODO: tune chunk size/overlap
    document_chunks = text_splitter.split_documents(document)

    embeddings = OllamaEmbeddings(model='nomic-embed-text')
    vector_store = Chroma.from_documents(document_chunks, embeddings)

    return vector_store
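
# Note: Chroma.from_documents above builds an in-memory collection, so the page
# is re-embedded every time a new session supplies a URL. If that gets slow,
# one option (an untested sketch, not part of the original app) is to persist
# the collection to disk:
#
#     Chroma.from_documents(document_chunks, embeddings, persist_directory="./chroma_db")
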
def get_context_retriever_chain(vector_store):
    # Set up the LLM, retriever, and prompt for the retriever chain,
    # which fetches the relevant chunks from the vector store.
    llm = Ollama(model='phi3')  # or any other model that you have

    retriever = vector_store.as_retriever(search_kwargs={"k": 2})  # TODO: tune `k`

    prompt = ChatPromptTemplate.from_messages(
        [
            MessagesPlaceholder(variable_name="chat_history"),
            ("user", "{input}"),
            ("user", "Given the above conversation, generate a search query to look up in order to get the information relevant to the conversation")
        ]
    )

    retriever_chain = create_history_aware_retriever(
        llm,
        retriever,
        prompt
    )

    return retriever_chain

def get_conversation_rag_chain(retriever_chain):
    # Stuff the retrieved context from the webpage into a prompt and
    # generate the answer to the question based on that context.
    llm = Ollama(model='phi3')  # or any other model that you have

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "Answer the user's questions based on the below context:\n\n{context}"
            ),
            MessagesPlaceholder(variable_name="chat_history"),
            ("user", "{input}"),
        ]
    )

    stuff_documents_chain = create_stuff_documents_chain(llm, prompt)

    return create_retrieval_chain(retriever_chain, stuff_documents_chain)

def get_response(user_input):
    # Invoke the chains created above to generate a response to the user query.
    retriever_chain = get_context_retriever_chain(st.session_state.vector_store)
    conversation_rag_chain = get_conversation_rag_chain(retriever_chain)

    response = conversation_rag_chain.invoke({
        "chat_history": st.session_state.chat_history,
        "input": user_input
    })

    return response['answer']


# Streamlit app config
st.set_page_config(page_title="Let's chat with a Website", page_icon="💻")
st.title("Let's chat with a Website")

# Sidebar setup
with st.sidebar:
    st.header("Settings")
    website_url = st.text_input("Type the URL here")

if website_url is None or website_url == "":
    st.info("Please enter a website URL...")

else:
    # Session state
    #
    # Keep the chat history so the conversation survives Streamlit reruns.
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = [
            AIMessage(content="Hello, I am a bot. How can I help you?"),
        ]
    # Build the vector store only once per session.
    if "vector_store" not in st.session_state:
        st.session_state.vector_store = get_vector_store_from_url(website_url)

    # User input
    user_query = st.chat_input("Type here...")
    if user_query is not None and user_query != "":

        response = get_response(user_query)

        st.session_state.chat_history.append(HumanMessage(content=user_query))
        st.session_state.chat_history.append(AIMessage(content=response))

    # Conversation history
    for message in st.session_state.chat_history:
        if isinstance(message, AIMessage):
            with st.chat_message("AI"):
                st.write(message.content)
        elif isinstance(message, HumanMessage):
            with st.chat_message("Human"):
                st.write(message.content)
--------------------------------------------------------------------------------