├── .dockerignore
├── .gitignore
├── backend
│   ├── Dockerfile
│   ├── backend.py
│   └── requirements.txt
├── docker-compose.prod.yml
├── docker-compose.yml
├── frontend
│   ├── Dockerfile
│   ├── frontend.py
│   └── requirements.txt
└── run.ps1

--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
__pycache__
*.pyc
*.pyo
*.pyd
.Python
env/
venv/
.env
*.git
*.gitignore
.idea/
*.pytest_cache/
*.coverage
*.log

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.venv/
__pycache__
.env

--------------------------------------------------------------------------------
/backend/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.12-slim AS base

WORKDIR /app

COPY requirements.txt /app/

RUN pip install -r requirements.txt

COPY . /app/

CMD ["uvicorn", "backend:app", "--host", "0.0.0.0", "--port", "8000"]

--------------------------------------------------------------------------------
/backend/backend.py:
--------------------------------------------------------------------------------
import os
import sys
import time
import uuid
import traceback
from typing import List, Optional
from io import BytesIO

import boto3
import docx
import pymongo
import uvicorn
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from pydantic import BaseModel
from fastapi import FastAPI, UploadFile, status, HTTPException, Request
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.callbacks.manager import get_openai_callback

# from langchain.vectorstores.redis import Redis as RedisVectorStore

# redis_url = "redis://redis:6379"


# Drop any inherited OPENAI_API_BASE so it does not conflict with the
# explicit azure_endpoint configuration below.
if "OPENAI_API_BASE" in os.environ:
    del os.environ["OPENAI_API_BASE"]
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
S3_KEY = os.getenv("S3_KEY")
S3_SECRET = os.getenv("S3_SECRET")
S3_BUCKET = os.getenv("S3_BUCKET")
S3_REGION = os.getenv("S3_REGION")
S3_PATH = os.getenv("S3_PATH")
MONGO_URL = os.getenv("MONGO_URL")

# Initialize Azure services
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    azure_endpoint=AZURE_ENDPOINT,
    api_key=OPENAI_API_KEY,
    openai_api_version="2023-07-01-preview",
)

llm = AzureChatOpenAI(
    azure_endpoint=AZURE_ENDPOINT,
    openai_api_version="2023-07-01-preview",
    deployment_name="GPT4",
    openai_api_key=OPENAI_API_KEY,
    openai_api_type="azure",
    model_name="gpt-4",  # chat model behind the "GPT4" deployment
    temperature=0,
)

# Initialize the AWS S3 client used to read and upload files
s3 = boto3.client(
    "s3",
    aws_access_key_id=S3_KEY,
    aws_secret_access_key=S3_SECRET,
    region_name=S3_REGION,
)

# AWS S3 session for uploading files (currently unused; uploads go
# through the boto3 client above)
aws_s3 = boto3.Session(
    aws_access_key_id=S3_KEY,
    aws_secret_access_key=S3_SECRET,
    region_name=S3_REGION,
)

# Initialize MongoDB
try:
    client = pymongo.MongoClient(MONGO_URL, uuidRepresentation="standard")
    db = client["chat_with_doc"]
    conversationcol = db["chat-history"]
    conversationcol.create_index(
        [("session_id", pymongo.ASCENDING)], unique=True
    )
except Exception:
    print(traceback.format_exc())
    exc_type, exc_obj, exc_tb = sys.exc_info()
    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
    print(exc_type, fname, exc_tb.tb_lineno)


# Pydantic models
class ChatMessageSent(BaseModel):
    session_id: Optional[str] = None
    user_input: str
    data_source: str


# Helper functions
def read_file_from_s3(file_name: str) -> str:
    """Read and extract text content from a PDF or DOCX file stored in S3."""
    file_content = s3.get_object(Bucket=S3_BUCKET, Key=f"{S3_PATH}{file_name}")[
        "Body"
    ].read()

    if file_name.endswith(".pdf"):
        pdf_reader = PdfReader(BytesIO(file_content))
        return " ".join(page.extract_text() for page in pdf_reader.pages)

    elif file_name.endswith(".docx"):
        docx_reader = docx.Document(BytesIO(file_content))
        return "\n".join(paragraph.text for paragraph in docx_reader.paragraphs)

    raise ValueError("Unsupported file format. Please use PDF or DOCX files.")


def get_response(
    file_name: str,
    session_id: str,
    query: str,
    model: str = "gpt-4",  # accepted for future use; the chain uses the global llm
    temperature: float = 0,
):
    """Build a FAISS index over the document and answer the query with a
    conversational retrieval chain."""
    file_name = file_name.split("/")[-1]
    text_content = read_file_from_s3(file_name)

    data = [LangchainDocument(page_content=text_content)]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n", " ", ""]
    )
    all_splits = text_splitter.split_documents(data)
    vectorstore = FAISS.from_documents(all_splits, embeddings)

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=vectorstore.as_retriever(),
        condense_question_prompt=PromptTemplate.from_template(
            "You are a professional document analyzer. Please answer the following question based on the document content. "
            "Be direct and precise. If the information is not in the document, clearly state that. "
            "Remember, you are Bob, the analyzer, not the user asking the question.\n\nQuestion: {question}"
        ),
    )

    with get_openai_callback() as cb:
        answer = qa_chain.invoke(
            {
                "question": query,
                "chat_history": load_memory_to_pass(session_id=session_id),
            }
        )
        answer["total_tokens_used"] = cb.total_tokens

    return answer


def load_memory_to_pass(session_id: str):
    """Load prior conversation turns as (question, answer) pairs."""
    data = conversationcol.find_one({"session_id": session_id})
    history = []
    if data:
        data = data["conversation"]
        for x in range(0, len(data), 2):
            history.append((data[x], data[x + 1]))
    return history


def get_session() -> str:
    """Create a new session identifier."""
    return str(uuid.uuid4())


def add_session_history(session_id: str, new_values: List):
    """Append a (question, answer) pair to the stored conversation."""
    document = conversationcol.find_one({"session_id": session_id})
    if document:
        conversation = document["conversation"]
        conversation.extend(new_values)
        conversationcol.update_one(
            {"session_id": session_id}, {"$set": {"conversation": conversation}}
        )
    else:
        conversationcol.insert_one(
            {"session_id": session_id, "conversation": new_values}
        )


app = FastAPI()


@app.middleware("http")
async def add_process_time_header(request: Request, call_next):
    start_time = time.time()
    response = await call_next(request)
    process_time = time.time() - start_time
    # Log the timing
    print(f"Time taken: {process_time:.3f} seconds")
    # Add timing to response headers
    response.headers["X-Process-Time"] = str(process_time)
    return response


app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)


# API endpoints
@app.post("/chat")
async def create_chat_message(chats: ChatMessageSent):
    try:
        session_id = chats.session_id or get_session()
        payload = ChatMessageSent(
            session_id=session_id,
            user_input=chats.user_input,
            data_source=chats.data_source,
        ).model_dump()

        response = get_response(
            file_name=payload.get("data_source"),
            session_id=payload.get("session_id"),
            query=payload.get("user_input"),
        )

        add_session_history(
            session_id=session_id,
            new_values=[payload.get("user_input"), response["answer"]],
        )

        return JSONResponse(
            content={
                "response": response,
                "session_id": str(session_id),
            }
        )

    except Exception:
        print(traceback.format_exc())
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="error"
        )


@app.post("/uploadFile")
async def uploadtos3(data_file: UploadFile):
    try:
        content = await data_file.read()

        # Upload directly to S3 using the boto3 client
        s3.put_object(
            Bucket=S3_BUCKET,
            Key=f"{S3_PATH}{data_file.filename.split('/')[-1]}",
            Body=content,
        )

        response = {
            "filename": data_file.filename.split("/")[-1],
            "file_path": f"s3://{S3_BUCKET}/{S3_PATH}{data_file.filename.split('/')[-1]}",
        }
        return JSONResponse(content=response)

    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="Item not found")


if __name__ == "__main__":
    uvicorn.run("backend:app", host="0.0.0.0", port=8000, reload=True)
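
Note: the two endpoints above can be exercised end to end with a short client
script. A minimal sketch using requests, assuming the backend is reachable at
http://localhost:8000 and a local sample.pdf exists (both are illustrative
assumptions; the payload and response shapes follow the handlers above):

    import requests

    BACKEND_URL = "http://localhost:8000"  # assumed local deployment

    # Upload a document; the handler stores it under S3_PATH in the bucket.
    with open("sample.pdf", "rb") as f:
        upload = requests.post(
            f"{BACKEND_URL}/uploadFile",
            files={"data_file": ("sample.pdf", f, "application/pdf")},
        )
    file_path = upload.json()["file_path"]

    # First question: no session_id, so the backend creates one.
    reply = requests.post(
        f"{BACKEND_URL}/chat",
        json={"user_input": "What is this document about?", "data_source": file_path},
    ).json()
    print(reply["response"]["answer"])

    # Follow-up question reusing the returned session_id.
    followup = requests.post(
        f"{BACKEND_URL}/chat",
        json={
            "user_input": "Summarize the key points.",
            "data_source": file_path,
            "session_id": reply["session_id"],
        },
    ).json()
    print(followup["response"]["answer"])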
" 138 | "Remember, you are Bob, the analyzer, not the user asking the question.\n\nQuestion: {question}" 139 | ), 140 | ) 141 | 142 | with get_openai_callback() as cb: 143 | answer = qa_chain.invoke( 144 | { 145 | "question": query, 146 | "chat_history": load_memory_to_pass(session_id=session_id), 147 | } 148 | ) 149 | answer["total_tokens_used"] = cb.total_tokens 150 | 151 | return answer 152 | 153 | 154 | def load_memory_to_pass(session_id: str): 155 | data = conversationcol.find_one({"session_id": session_id}) 156 | history = [] 157 | if data: 158 | data = data["conversation"] 159 | for x in range(0, len(data), 2): 160 | history.extend([(data[x], data[x + 1])]) 161 | return history 162 | 163 | 164 | def get_session() -> str: 165 | return str(uuid.uuid4()) 166 | 167 | 168 | def add_session_history(session_id: str, new_values: List): 169 | document = conversationcol.find_one({"session_id": session_id}) 170 | if document: 171 | conversation = document["conversation"] 172 | conversation.extend(new_values) 173 | conversationcol.update_one( 174 | {"session_id": session_id}, {"$set": {"conversation": conversation}} 175 | ) 176 | else: 177 | conversationcol.insert_one( 178 | {"session_id": session_id, "conversation": new_values} 179 | ) 180 | 181 | 182 | app = FastAPI() 183 | 184 | @app.middleware("http") 185 | async def add_process_time_header(request: Request, call_next): 186 | start_time = time.time() 187 | response = await call_next(request) 188 | process_time = time.time() - start_time 189 | # Log the timing 190 | print(f"Time taken: {process_time:.3f} seconds") 191 | # Add timing to response headers 192 | response.headers["X-Process-Time"] = str(process_time) 193 | return response 194 | 195 | app.add_middleware( 196 | CORSMiddleware, 197 | allow_origins=["*"], 198 | allow_credentials=False, 199 | allow_methods=["*"], 200 | allow_headers=["*"], 201 | ) 202 | 203 | 204 | # API endpoints 205 | @app.post("/chat") 206 | async def create_chat_message(chats: ChatMessageSent): 207 | try: 208 | session_id = chats.session_id or get_session() 209 | payload = ChatMessageSent( 210 | session_id=session_id, 211 | user_input=chats.user_input, 212 | data_source=chats.data_source, 213 | ).model_dump() 214 | 215 | response = get_response( 216 | file_name=payload.get("data_source"), 217 | session_id=payload.get("session_id"), 218 | query=payload.get("user_input"), 219 | ) 220 | 221 | add_session_history( 222 | session_id=session_id, 223 | new_values=[payload.get("user_input"), response["answer"]], 224 | ) 225 | 226 | return JSONResponse( 227 | content={ 228 | "response": response, 229 | "session_id": str(session_id), 230 | } 231 | ) 232 | 233 | except Exception: 234 | print(traceback.format_exc()) 235 | exc_type, exc_obj, exc_tb = sys.exc_info() 236 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 237 | print(exc_type, fname, exc_tb.tb_lineno) 238 | raise HTTPException(status_code=status.HTTP_204_NO_CONTENT, detail="error") 239 | 240 | 241 | @app.post("/uploadFile") 242 | async def uploadtos3(data_file: UploadFile): 243 | try: 244 | content = await data_file.read() 245 | 246 | # Upload directly to S3 using boto3 client 247 | s3.put_object( 248 | Bucket=S3_BUCKET, 249 | Key=f"{S3_PATH}{data_file.filename.split('/')[-1]}", 250 | Body=content 251 | ) 252 | 253 | response = { 254 | "filename": data_file.filename.split("/")[-1], 255 | "file_path": f"s3://{S3_BUCKET}/{S3_PATH}{data_file.filename.split('/')[-1]}", 256 | } 257 | return JSONResponse(content=response) 258 | 259 | except 

--------------------------------------------------------------------------------
/frontend/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.12-slim

WORKDIR /app

COPY requirements.txt .

RUN pip install -r requirements.txt

COPY frontend.py .

EXPOSE 8501
ENTRYPOINT ["streamlit", "run", "frontend.py"]

--------------------------------------------------------------------------------
/frontend/frontend.py:
--------------------------------------------------------------------------------
import json
import os
import time

import requests
import streamlit as st

BACKEND_URL = os.getenv("BACKEND_URL") or "http://localhost:8000"


def chat(user_input, data, session_id=None):
    """
    Sends a user input to the chat API and returns the response.

    Args:
        user_input (str): The user's input.
        data (str): The data source.
        session_id (str, optional): Session identifier. Defaults to None.

    Returns:
        tuple: A tuple containing the response answer and the updated session_id.
    """
    # API endpoint for chat
    url = BACKEND_URL + "/chat"

    # Print inputs for debugging
    print("user ", user_input)
    print("data", data)
    print("session_id", session_id)

    # Prepare the payload for the API request
    if session_id is None:
        payload = json.dumps({"user_input": user_input, "data_source": data})
    else:
        payload = json.dumps(
            {"user_input": user_input, "data_source": data, "session_id": session_id}
        )

    # Set headers for the API request
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json",
    }

    # Make a POST request to the chat API
    response = requests.post(url, headers=headers, data=payload)
    # Print the API response for debugging
    print(response)
    print(response.json())

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Return the response answer and updated session_id
        return response.json()["response"]["answer"], response.json()["session_id"]
    # Surface backend failures instead of silently returning None
    raise RuntimeError(f"Chat request failed with status {response.status_code}")


def upload_file(data_file):
    """Upload the selected file to the backend and return its S3 path."""
    url = BACKEND_URL + "/uploadFile"
    files = {"data_file": (data_file.name, data_file, data_file.type)}

    headers = {"accept": "application/json"}
    response = requests.post(url, headers=headers, files=files)

    if response.status_code == 200:
        return response.json()["file_path"]


# Set page configuration for the Streamlit app
st.set_page_config(page_title="Document Chat", page_icon="📕", layout="wide")

# Initialize chat history and session variables
if "messages" not in st.session_state:
    st.session_state.messages = []
if "sessionid" not in st.session_state:
    st.session_state.sessionid = None

# Allow the user to upload a file (PDF or DOCX)
data_file = st.file_uploader(
    label="Input file", accept_multiple_files=False, type=["pdf", "docx"]
)
st.divider()

# Upload the file to the backend (and on to S3) as soon as it is selected
s3_upload_url = None
if data_file:
    s3_upload_url = upload_file(data_file)

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("You can ask any question"):
    if s3_upload_url is None:
        st.warning("Please upload a PDF or DOCX file first.")
    else:
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            if st.session_state.sessionid is None:
                assistant_response, session_id = chat(prompt, data=s3_upload_url)
                st.session_state.sessionid = session_id
            else:
                assistant_response, session_id = chat(
                    prompt,
                    session_id=st.session_state.sessionid,
                    data=s3_upload_url,
                )

            message_placeholder = st.empty()
            full_response = ""

            # Simulate a typing effect while rendering the answer
            for chunk in assistant_response.split():
                full_response += chunk + " "
                time.sleep(0.03)
                message_placeholder.markdown(full_response + "▌")

            message_placeholder.markdown(full_response)

        st.session_state.messages.append(
            {"role": "assistant", "content": full_response}
        )
19 | """ 20 | # API endpoint for chat 21 | url = BACKEND_URL+"/chat" 22 | 23 | # Print inputs for debugging 24 | print("user ", user_input) 25 | print("data", data) 26 | print("session_id", session_id) 27 | 28 | # Prepare payload for the API request 29 | if session_id is None: 30 | payload = json.dumps({"user_input": user_input, "data_source": data}) 31 | else: 32 | payload = json.dumps( 33 | {"user_input": user_input, "data_source": data, "session_id": session_id} 34 | ) 35 | 36 | # Set headers for the API request 37 | headers = { 38 | "accept": "application/json", 39 | "Content-Type": "application/json", 40 | } 41 | 42 | # Make a POST request to the chat API 43 | response = requests.request("POST", url, headers=headers, data=payload) 44 | print(response) 45 | # Print the API response for debugging 46 | print(response.json()) 47 | 48 | # Check if the request was successful (status code 200) 49 | if response.status_code == 200: 50 | # Return the response answer and updated session_id 51 | return response.json()["response"]["answer"], response.json()["session_id"] 52 | 53 | 54 | def upload_file(data_file): 55 | url = BACKEND_URL + "/uploadFile" 56 | files = { 57 | "data_file": (data_file.name, data_file, data_file.type) 58 | } 59 | 60 | headers = {"accept": "application/json"} 61 | response = requests.post(url, headers=headers, files=files) 62 | 63 | if response.status_code == 200: 64 | return response.json()["file_path"] 65 | 66 | 67 | import streamlit as st 68 | import time 69 | import os 70 | 71 | # Set page configuration for the Streamlit app 72 | st.set_page_config(page_title="Document Chat", page_icon="📕", layout="wide") 73 | 74 | # Initialize chat history and session variables 75 | if "messages" not in st.session_state: 76 | st.session_state.messages = [] 77 | if "sessionid" not in st.session_state: 78 | st.session_state.sessionid = None 79 | 80 | # Allow user to upload a file (PDF or DOCX) 81 | data_file = st.file_uploader( 82 | label="Input file", accept_multiple_files=False, type=["pdf", "docx"] 83 | ) 84 | st.divider() 85 | 86 | # Process the uploaded file if available 87 | if data_file: 88 | # Directly upload the file to the specified API endpoint 89 | s3_upload_url = upload_file(data_file) 90 | 91 | # Display chat messages from history on app rerun 92 | for message in st.session_state.messages: 93 | with st.chat_message(message["role"]): 94 | st.markdown(message["content"]) 95 | 96 | if prompt := st.chat_input("You can ask any question"): 97 | st.session_state.messages.append({"role": "user", "content": prompt}) 98 | with st.chat_message("user"): 99 | st.markdown(prompt) 100 | 101 | with st.chat_message("assistant"): 102 | if st.session_state.sessionid is None: 103 | assistant_response, session_id = chat(prompt, data=s3_upload_url) 104 | st.session_state.sessionid = session_id 105 | else: 106 | assistant_response, session_id = chat(prompt, session_id=st.session_state.sessionid, data=s3_upload_url) 107 | 108 | message_placeholder = st.empty() 109 | full_response = "" 110 | 111 | for chunk in assistant_response.split(): 112 | full_response += chunk + " " 113 | time.sleep(0.03) 114 | message_placeholder.markdown(full_response + "▌") 115 | 116 | message_placeholder.markdown(full_response) 117 | 118 | st.session_state.messages.append({"role": "assistant", "content": full_response}) -------------------------------------------------------------------------------- /frontend/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 
streamlit

--------------------------------------------------------------------------------
/run.ps1:
--------------------------------------------------------------------------------
$dockerComposeFile = "docker-compose.yml"

# Build the Docker images
Write-Host "Building Docker images..."
docker-compose -f $dockerComposeFile build

# Start the services in the background using Docker Compose
Write-Host "Starting services..."
docker-compose -f $dockerComposeFile up -d

# Display the status of the running containers
Write-Host "Displaying the status of running containers..."
docker ps
--------------------------------------------------------------------------------