├── nginx ├── Dockerfile └── default.conf ├── run.sh ├── utils ├── google_authenticate.py ├── string_util.py ├── ip_tools.py └── mailersend.py ├── requirements.txt ├── .vscode └── settings.json ├── database ├── connection.py ├── cost_manager.py ├── session_manager.py ├── chat_manager.py └── user_manager.py ├── helpers └── validators.py ├── test.py ├── Dockerfile ├── docker-compose.local.yml ├── config └── settings.py ├── app.py ├── docker-compose.yml ├── views ├── capcha_plugin.py ├── login_view.py ├── signup_view.py └── main_view.py ├── main.py ├── README.md ├── .dockerignore ├── .gitignore └── embeddings └── vector_store.py /nginx/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx:latest 2 | 3 | COPY default.conf /etc/nginx/conf.d -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start Streamlit in the background 4 | streamlit run app.py & 5 | 6 | # Start your Python script 7 | python main.py -------------------------------------------------------------------------------- /utils/google_authenticate.py: -------------------------------------------------------------------------------- 1 | from streamlit_google_auth import Authenticate 2 | 3 | authenticator = Authenticate( 4 | secret_credentials_path='./client_secret.json', 5 | cookie_name='rag-system-biscoito', 6 | cookie_key='senha_maluca_12345', 7 | redirect_uri='https://gtrag.bot/', 8 | ) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | google-generativeai 3 | python-dotenv 4 | langchain 5 | PyPDF2 6 | chromadb 7 | faiss-cpu 8 | langchain_google_genai 9 | langchain-community 10 | mysql-connector-python 11 | bcrypt 12 | ratelimit 13 | openai 14 | langfuse 15 | captcha 16 | mailersend 17 | pyodbc 18 | streamlit_google_auth 19 | streamlit-extras 20 | psycopg2-binary 21 | pymupdf 22 | streamlit-url-fragment -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "sqltools.connections": [ 3 | { 4 | "previewLimit": 50, 5 | "server": "localhost", 6 | "port": 5432, 7 | "driver": "PostgreSQL", 8 | "name": "oknoke", 9 | "group": "oknoke", 10 | "database": "oknoke", 11 | "username": "oknoke", 12 | "password": "oknoke" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /database/connection.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from config import settings 3 | 4 | def create_connection(): 5 | try: 6 | conn = psycopg2.connect( 7 | host=settings.DB_HOST, 8 | database=settings.DB_NAME, 9 | user=settings.DB_USER, 10 | password=settings.DB_PASSWORD 11 | ) 12 | return conn 13 | except Exception as e: 14 | print(e) 15 | return None -------------------------------------------------------------------------------- /helpers/validators.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def is_valid_email(email): 4 | email_regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' 5 | return re.match(email_regex, email) is not None 6 | 7 | def is_valid_password(password): 8 | if len(password) < 8: 9 | 
return False 10 | if not re.search(r"[A-Za-z]", password): 11 | return False 12 | if not re.search(r"[0-9]", password): 13 | return False 14 | if not re.search(r"[!@#$%^&*(),.?\":{}|<>]", password): 15 | return False 16 | return True -------------------------------------------------------------------------------- /utils/string_util.py: -------------------------------------------------------------------------------- 1 | 2 | def find_positions_multiple(text, start_substring, end_substring): 3 | positions = [] 4 | start_idx = 0 5 | while True: 6 | start = text.find(start_substring, start_idx) 7 | if start == -1: 8 | break 9 | end = text.find(end_substring, start + len(start_substring)) 10 | if end == -1: 11 | break 12 | positions.append((start, end + len(end_substring))) # Adjust end position to include end_substring 13 | start_idx = end + len(end_substring) 14 | return positions -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import openai 2 | from config import settings 3 | openai_client = openai.OpenAI(api_key=settings.OPENAI_API_KEY) 4 | # List all files 5 | files = openai_client.files.list() 6 | print(files) 7 | # Delete each file 8 | for file in files.data: 9 | file_id = file.id 10 | openai_client.files.delete(file_id) 11 | print(f"Deleted file: {file_id}") 12 | 13 | 14 | vectors = openai_client.beta.vector_stores.list() 15 | print(vectors) 16 | 17 | for vector in vectors: 18 | vector_id = vector.id 19 | openai_client.beta.vector_stores.delete( 20 | vector_store_id=vector_id 21 | ) 22 | print(f"Deleted vector: {vector_id}") 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /usr/src/app 6 | 7 | # Copy the requirements file into the container 8 | COPY requirements.txt ./ 9 | 10 | # Install any dependencies specified in requirements.txt 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | # Copy the rest of the application code into the container 14 | COPY . . 
15 | 16 | # Make port 80 available to the world outside this container 17 | # (Optional, only if your application runs on a specific port) 18 | # EXPOSE 80 19 | 20 | # Define environment variable 21 | # ENV PYTHONUNBUFFERED=1 22 | 23 | # Make the run script executable 24 | RUN chmod +x run.sh 25 | 26 | # Command to run the script 27 | CMD ["./run.sh"] -------------------------------------------------------------------------------- /docker-compose.local.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | postgres: 5 | image: postgres:13 6 | container_name: postgres 7 | environment: 8 | POSTGRES_DB: oknoke 9 | POSTGRES_USER: oknoke 10 | POSTGRES_PASSWORD: oknoke 11 | ports: 12 | - "5432:5432" # Expose PostgreSQL port 13 | volumes: 14 | - postgres_data:/var/lib/postgresql/data 15 | 16 | # nginx: 17 | # container_name: nginx 18 | # restart: always 19 | # build: 20 | # context: ./nginx 21 | # dockerfile: Dockerfile 22 | # volumes: 23 | # - /etc/letsencrypt/live/gtrag.bot/fullchain.pem:/etc/letsencrypt/live/gtrag.bot/fullchain.pem 24 | # - /etc/letsencrypt/live/gtrag.bot/privkey.pem:/etc/letsencrypt/live/gtrag.bot/privkey.pem 25 | # ports: 26 | # - "80:80" 27 | # - "443:443" 28 | # depends_on: 29 | # - postgres 30 | 31 | volumes: 32 | postgres_data: {} -------------------------------------------------------------------------------- /config/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | load_dotenv() 5 | 6 | # LLM Model Information 7 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 8 | 9 | # MySQL Database information 10 | DB_HOST = os.getenv("DB_HOST") 11 | DB_USER = os.getenv("DB_USER") 12 | DB_PASSWORD = os.getenv("DB_PASSWORD") 13 | DB_NAME = os.getenv("DB_NAME") 14 | 15 | # Mailersend Information 16 | MAILERSEND_API_KEY = os.getenv("MAILERSEND_API_KEY") 17 | EMAIL_TEMPLATE_SIGNUP = os.getenv("EMAIL_TEMPLATE_SIGNUP") 18 | 19 | # URL Information 20 | BACKEND_URL = os.getenv("BACKEND_URL") 21 | PRODUCT_URL = os.getenv("PRODUCT_URL") 22 | 23 | # reCAPTCHA Information 24 | RECAPTCHA_SITE_KEY = os.getenv("RECAPTCHA_SITE_KEY") 25 | RECAPTCHA_SECRET_KEY = os.getenv("RECAPTCHA_SECRET_KEY") 26 | 27 | # langfuse Information 28 | LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY") 29 | LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY") 30 | LANGFUSE_HOST = os.getenv("LANGFUSE_HOST") -------------------------------------------------------------------------------- /utils/ip_tools.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from streamlit import runtime 3 | from streamlit.runtime.scriptrunner import get_script_run_ctx 4 | 5 | def get_remote_ip() -> str: 6 | """Get remote ip.""" 7 | try: 8 | ctx = get_script_run_ctx() 9 | if ctx is None: 10 | return None 11 | 12 | session_info = runtime.get_instance().get_client(ctx.session_id) 13 | if session_info is None: 14 | return None 15 | except Exception as e: 16 | return None 17 | 18 | return session_info.request.remote_ip 19 | 20 | def get_country_name(ip_address: str) -> str: 21 | """Get country name from IP address using ipapi service.""" 22 | try: 23 | response = requests.get(f"http://ip-api.com/json/{ip_address}") 24 | data = response.json() 25 | return data.get("country", "Unknown") 26 | except Exception as e: 27 | return "Unknown" 28 | 29 | def get_remote_country() -> str: 30 | """Get remote country""" 31 | 
ip_address = get_remote_ip() 32 | country = get_country_name(ip_address) 33 | return country -------------------------------------------------------------------------------- /database/cost_manager.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.connection import create_connection 3 | 4 | def create_cost_table(): 5 | conn = create_connection() 6 | if conn: 7 | try: 8 | cursor = conn.cursor() 9 | create_table_query = """ 10 | CREATE TABLE IF NOT EXISTS cost ( 11 | id SERIAL PRIMARY KEY, 12 | session_id INT NOT NULL, 13 | cost FLOAT NOT NULL 14 | ); 15 | """ 16 | cursor.execute(create_table_query) 17 | conn.commit() 18 | except Exception as e: 19 | print(e) 20 | finally: 21 | cursor.close() 22 | conn.close() 23 | 24 | def insert_cost(session_id, cost): 25 | query = "INSERT INTO cost (session_id, cost) VALUES (%s, %s)" 26 | conn = create_connection() 27 | if conn: 28 | try: 29 | cursor = conn.cursor() 30 | cursor.execute(query, (session_id, cost)) 31 | conn.commit() 32 | except Exception as e: 33 | print(f"Error: {e}") 34 | finally: 35 | if 'cursor' in locals(): 36 | cursor.close() 37 | conn.close() -------------------------------------------------------------------------------- /utils/mailersend.py: -------------------------------------------------------------------------------- 1 | from mailersend import emails 2 | from config import settings 3 | 4 | api_key = settings.MAILERSEND_API_KEY 5 | signup_template = settings.EMAIL_TEMPLATE_SIGNUP 6 | backend_url = settings.BACKEND_URL 7 | 8 | mailer = emails.NewEmail(api_key) 9 | 10 | def signup_mailer(customer_email, verify_token): 11 | signup_link = f"{backend_url}/verify-email?token={verify_token}" 12 | print(signup_link) 13 | print(api_key) 14 | print(signup_template) 15 | print(backend_url) 16 | mail_body = {"signup_link": signup_link} 17 | mail_from = { 18 | "name": "GTRAG", 19 | "email": "info@gtrag.com", 20 | } 21 | recipients = [ 22 | { 23 | "email": customer_email, 24 | } 25 | ] 26 | personalization = [ 27 | { 28 | "email": customer_email, 29 | "data": { 30 | "verify_id": signup_link 31 | } 32 | } 33 | ] 34 | mailer.set_mail_from(mail_from, mail_body) 35 | mailer.set_mail_to(recipients, mail_body) 36 | mailer.set_subject("Please verify your email", mail_body) 37 | mailer.set_template(signup_template, mail_body) 38 | mailer.set_personalization(personalization, mail_body) 39 | response = mailer.send(mail_body) 40 | print(response) 41 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | st.set_page_config(layout="wide") 3 | 4 | import asyncio 5 | from views import main_view, login_view, signup_view 6 | from database.user_manager import create_users_table 7 | from database.chat_manager import create_chat_table 8 | from database.cost_manager import create_cost_table 9 | from database.session_manager import create_session_table 10 | 11 | # st.set_page_config( 12 | # page_title="Chatbot", 13 | # page_icon="🤖" 14 | # ) 15 | async def main(): 16 | """Main function to execute the Streamlit app.""" 17 | if 'connected' not in st.session_state: 18 | st.session_state['connected'] = False 19 | 20 | if "logged_in" not in st.session_state: 21 | st.session_state["logged_in"] = False 22 | st.session_state['page'] = 'login' 23 | 24 | if "user_id" in st.session_state and st.session_state["user_id"]: 25 | 
st.session_state["logged_in"] = True 26 | 27 | if 'user_info' not in st.session_state: 28 | st.session_state['user_info'] = {} 29 | 30 | if st.session_state['logged_in']: 31 | await main_view.main_content() 32 | else: 33 | if st.session_state['page'] == 'login': 34 | login_view.login_page() 35 | else: 36 | signup_view.signup_page() 37 | 38 | if __name__ == "__main__": 39 | # Call the function to create the table 40 | create_users_table() 41 | create_chat_table() 42 | create_cost_table() 43 | create_session_table() 44 | # Run the main function 45 | asyncio.run(main()) -------------------------------------------------------------------------------- /nginx/default.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 80; 3 | server_name gtrag.bot; 4 | 5 | proxy_read_timeout 600; 6 | proxy_connect_timeout 300; 7 | 8 | ssl_certificate /etc/letsencrypt/live/gtrag.bot/fullchain.pem; 9 | ssl_certificate_key /etc/letsencrypt/live/gtrag.bot/privkey.pem; 10 | 11 | location / { 12 | proxy_pass http://app:8000; 13 | 14 | proxy_http_version 1.1; 15 | proxy_set_header Upgrade $http_upgrade; 16 | proxy_set_header Connection "upgrade"; 17 | proxy_set_header Host $host; 18 | proxy_set_header X-Real-IP $remote_addr; 19 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 20 | proxy_set_header X-Forwarded-Proto $scheme; 21 | } 22 | } 23 | 24 | server { 25 | listen 443 ssl; 26 | server_name gtrag.bot; 27 | 28 | proxy_read_timeout 600; 29 | proxy_connect_timeout 300; 30 | 31 | client_max_body_size 200M; 32 | 33 | ssl_certificate /etc/letsencrypt/live/gtrag.bot/fullchain.pem; 34 | ssl_certificate_key /etc/letsencrypt/live/gtrag.bot/privkey.pem; 35 | 36 | location / { 37 | proxy_pass http://app:8501; 38 | 39 | proxy_http_version 1.1; 40 | proxy_set_header Upgrade $http_upgrade; 41 | proxy_set_header Connection "upgrade"; 42 | proxy_set_header Host $host; 43 | proxy_set_header X-Real-IP $remote_addr; 44 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 45 | proxy_set_header X-Forwarded-Proto $scheme; 46 | } 47 | } -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | app: 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile 8 | container_name: Oknoke 9 | environment: 10 | - OPENAI_API_KEY=${OPENAI_API_KEY} 11 | - DB_HOST=postgres 12 | - DB_USER=oknoke 13 | - DB_PASSWORD=oknoke 14 | - DB_NAME=oknoke 15 | - MAILERSEND_API_KEY=${MAILERSEND_API_KEY} 16 | - EMAIL_TEMPLATE_SIGNUP=${EMAIL_TEMPLATE_SIGNUP} 17 | - BACKEND_URL=${BACKEND_URL} 18 | - PRODUCT_URL=${PRODUCT_URL} 19 | - RECAPTCHA_SITE_KEY='' 20 | - RECAPTCHA_SECRET_KEY='' 21 | - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY} 22 | - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY} 23 | - LANGFUSE_HOST=${LANGFUSE_HOST} 24 | volumes: 25 | - .:/usr/src/app 26 | ports: 27 | - "5000:5000" 28 | entrypoint: ["./run.sh"] 29 | 30 | postgres: 31 | image: postgres:13 32 | container_name: postgres 33 | environment: 34 | POSTGRES_DB: oknoke 35 | POSTGRES_USER: oknoke 36 | POSTGRES_PASSWORD: oknoke 37 | ports: 38 | - "5432:5432" # Expose PostgreSQL port 39 | volumes: 40 | - postgres_data:/var/lib/postgresql/data 41 | 42 | nginx: 43 | container_name: nginx 44 | restart: always 45 | build: 46 | context: ./nginx 47 | dockerfile: Dockerfile 48 | volumes: 49 | - /etc/letsencrypt/live/gtrag.bot/fullchain.pem:/etc/letsencrypt/live/gtrag.bot/fullchain.pem 50 | - /etc/letsencrypt/live/gtrag.bot/privkey.pem:/etc/letsencrypt/live/gtrag.bot/privkey.pem 51 | ports: 52 | - "80:80" 53 | - "443:443" 54 | depends_on: 55 | - app 56 | - postgres 57 | 58 | volumes: 59 | postgres_data: {} -------------------------------------------------------------------------------- /views/capcha_plugin.py: -------------------------------------------------------------------------------- 1 | # import library 2 | import streamlit as st 3 | from captcha.image import ImageCaptcha 4 | import random, string 5 | 6 | 7 | # define the costant 8 | length_captcha = 4 9 | width = 220 10 | height = 100 11 | 12 | # define the function for the captcha control 13 | def captcha_control(): 14 | if 'controllo' not in st.session_state or st.session_state['controllo'] == False: 15 | st.session_state['controllo'] = False 16 | 17 | # Set up the captcha text 18 | if 'Captcha' not in st.session_state: 19 | st.session_state['Captcha'] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=4)) 20 | 21 | # Create columns for the captcha image, input, and verify button 22 | col1, col2, col3 = st.columns([2, 4, 1]) 23 | 24 | with col1: 25 | image = ImageCaptcha(width=width, height=height) 26 | data = image.generate(st.session_state['Captcha']) 27 | st.image(data) 28 | 29 | with col2: 30 | capta2_text = st.text_input('Enter captcha text', placeholder='Type here...') 31 | 32 | with col3: 33 | st.text('') 34 | if st.button("Verify", key="verify_button", help="Click to verify the captcha"): 35 | # if st.button("Verify", key="verify_button"): 36 | if st.session_state['Captcha'].lower() == capta2_text.lower().strip(): 37 | del st.session_state['Captcha'] 38 | st.session_state['controllo'] = True 39 | st.rerun() 40 | else: 41 | st.error("❌ Incorrect captcha. 
Please try again.") 42 | del st.session_state['Captcha'] 43 | st.rerun() 44 | else: 45 | st.stop() -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from fastapi.responses import HTMLResponse 2 | from fastapi import FastAPI, HTTPException, Request 3 | from pydantic import BaseModel 4 | from database.connection import create_connection 5 | from utils.mailersend import signup_mailer 6 | import bcrypt 7 | import random 8 | import string 9 | import uvicorn 10 | import os 11 | from dotenv import load_dotenv 12 | 13 | load_dotenv() 14 | PRODUCT_URL = os.getenv("PRODUCT_URL") 15 | 16 | app = FastAPI() 17 | 18 | 19 | @app.get("/api/verify-email") 20 | async def verify_email(token: str, request: Request): 21 | conn = create_connection() 22 | if conn: 23 | try: 24 | cursor = conn.cursor() 25 | cursor.execute("UPDATE users SET status = 'verified' WHERE verification_token = %s", (token,))  # parameterized query to avoid SQL injection 26 | conn.commit() 27 | html_content = f""" 28 | 29 | 30 | 31 | Email Verified 32 | 37 | 38 | 39 |
<h1>Email Verified Successfully!</h1>
40 | <p>You will be redirected shortly...</p>
41 | 42 | 43 | """ 44 | return HTMLResponse(content=html_content, status_code=200) 45 | except Exception as e: 46 | raise HTTPException(status_code=400, detail=f"Error: {e}") 47 | finally: 48 | if 'cursor' in locals(): 49 | cursor.close() 50 | conn.close() 51 | 52 | if __name__ == "__main__": 53 | print("server up") 54 | uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) -------------------------------------------------------------------------------- /database/session_manager.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.connection import create_connection 3 | 4 | def create_session_table(): 5 | conn = create_connection() 6 | if conn: 7 | try: 8 | cursor = conn.cursor() 9 | create_table_query = """ 10 | CREATE TABLE IF NOT EXISTS session ( 11 | id SERIAL PRIMARY KEY, 12 | user_id INT NOT NULL, 13 | start_session TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 14 | end_session TIMESTAMP NULL 15 | ); 16 | """ 17 | cursor.execute(create_table_query) 18 | conn.commit() 19 | except Exception as e: 20 | print(e) 21 | finally: 22 | cursor.close() 23 | conn.close() 24 | 25 | def insert_start_session(user_id): 26 | conn = create_connection() 27 | new_id = None 28 | if conn: 29 | try: 30 | cursor = conn.cursor() 31 | cursor.execute("SELECT id, end_session FROM session ORDER BY id DESC LIMIT 1") 32 | record = cursor.fetchone() 33 | if record and record[1] is None: 34 | update_end_session(record[0]) 35 | cursor.execute("INSERT INTO session (user_id) VALUES (%s) RETURNING id;", (user_id,)) 36 | new_id = cursor.fetchone()[0] 37 | conn.commit() 38 | except Exception as e: 39 | print(e) 40 | finally: 41 | if 'cursor' in locals(): 42 | cursor.close() 43 | conn.close() 44 | return new_id 45 | 46 | def update_end_session(session_id): 47 | query = "UPDATE session SET end_session = CURRENT_TIMESTAMP WHERE id = %s" 48 | conn = create_connection() 49 | if conn: 50 | try: 51 | cursor = conn.cursor() 52 | cursor.execute(query, (session_id,)) 53 | conn.commit() 54 | except Exception as e: 55 | print(e) 56 | finally: 57 | if 'cursor' in locals(): 58 | cursor.close() 59 | conn.close() -------------------------------------------------------------------------------- /views/login_view.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.user_manager import authenticate_user, get_user_id, update_user_country, create_google_user 3 | from database.session_manager import insert_start_session 4 | from views.capcha_plugin import captcha_control 5 | from utils.ip_tools import get_remote_country 6 | from embeddings.vector_store import clear_cache 7 | from streamlit_google_auth import Authenticate 8 | from utils.google_authenticate import authenticator 9 | 10 | 11 | def init_login_session(email): 12 | user_id = get_user_id(email) 13 | session_id = insert_start_session(user_id) 14 | st.session_state["logged_in"] = True 15 | st.session_state["email"] = email 16 | st.session_state["user_id"] = user_id 17 | st.session_state["session_id"] = session_id 18 | clear_cache(user_id) 19 | 20 | def login_page(): 21 | st.title("Sign In") 22 | email = st.text_input("Email") 23 | password = st.text_input("Password", type="password") 24 | authenticator.check_authentification() 25 | authorization_url = authenticator.get_authorization_url() 26 | 27 | print(st.session_state["connected"]) 28 | # SignIn By Google 29 | if st.session_state["connected"] == False: 30 | st.link_button('Sign In 
With Google', authorization_url, use_container_width=True) 31 | elif st.session_state["connected"] == True: 32 | email = st.session_state['user_info'].get('email') 33 | create_google_user(email) 34 | init_login_session(email) 35 | st.rerun() 36 | 37 | # Captcha Component 38 | captcha_control() 39 | 40 | # Signin By Email and Password 41 | col1, col2, col3 = st.columns([1, 3, 1]) 42 | with col2: 43 | if st.button("Sign In", use_container_width=True): 44 | res = authenticate_user(email, password) 45 | if res == "Success": 46 | update_user_country(email, get_remote_country()) 47 | init_login_session(email) 48 | st.rerun() 49 | else: 50 | # st.error(res) 51 | print(res) 52 | 53 | if st.button("Go to Sign Up", use_container_width=True): 54 | st.session_state['page'] = 'signup' 55 | st.rerun() 56 | 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAG System with OpenAI and Streamlit 🤖📄 2 | 3 | Welcome to the **RAG System** (Retrieve and Generate) — an innovative AI-powered chatbot that leverages the OpenAI Assistant API and Streamlit to provide real-time, context-aware answers based on user-uploaded documents. 4 | 5 | ## Features ✨ 6 | 7 | - **Multiformat Document Support**: Upload and process various file types including PDFs, DOCX, PPTX, TXT, and script files. 📂 8 | - **Real-time Responses**: Enjoy seamless and interactive responses fetched via WebSocket connections. 🔄 9 | - **Context Aware**: Provides answers based on the content of the uploaded documents, making the interactions more meaningful and personalized. 🔍 10 | 11 | ## Installation ⚙️ 12 | 13 | To get started with the RAG system, follow these steps: 14 | 15 | 1. **Clone the Repository**: 16 | ```bash 17 | git clone https://github.com/SuperGalaxy0901/Streamlit-OpenAI-Chatbot.git 18 | cd rag-system 19 | ``` 20 | 21 | 2. **Set up a Virtual Environment** (recommended): 22 | ```bash 23 | python -m venv env 24 | source env/bin/activate # On Windows use `env\Scripts\activate` 25 | ``` 26 | 27 | 3. **Install the Required Packages**: 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | 4. **Set Up Environment Variables**: 33 | - Create a `.env` file to safely store your API keys and configuration settings. 34 | ```plaintext 35 | OPENAI_API_KEY=your_openai_api_key 36 | ``` 37 | 38 | ## Usage 🚀 39 | 40 | 1. **Run the Streamlit App**: 41 | ```bash 42 | streamlit run app.py 43 | ``` 44 | 45 | 2. **Interact with the Chatbot**: 46 | - Upload documents via the application interface. 📤 47 | - Engage with the chatbot as it generates insightful responses based on your document contents. 💬 48 | 49 | ## Architecture Overview 🏗️ 50 | 51 | - **Streamlit**: Provides the front-end interface where users can upload documents and interact with the chatbot. 🌐 52 | - **OpenAI Assistant API**: Powers the natural language comprehension and generation. 🧠 53 | - **WebSockets**: Enables real-time, efficient communication between the front-end and back-end services. 📡 54 | 55 | ## Acknowledgements 🙏 56 | 57 | - [OpenAI](https://openai.com) for their incredible API. 58 | - [Streamlit](https://streamlit.io) for the easy-to-use app framework. 
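
## Fuller `.env` Example 🔧

The example above only covers the OpenAI key; `config/settings.py` also reads database, MailerSend, URL, reCAPTCHA, and Langfuse settings. Below is a sketch of a more complete `.env` — every value is a placeholder, so substitute your own credentials and hosts:

```plaintext
OPENAI_API_KEY=your_openai_api_key
DB_HOST=localhost
DB_USER=your_db_user
DB_PASSWORD=your_db_password
DB_NAME=your_db_name
MAILERSEND_API_KEY=your_mailersend_api_key
EMAIL_TEMPLATE_SIGNUP=your_mailersend_template_id
BACKEND_URL=http://localhost:8000
PRODUCT_URL=http://localhost:8501
RECAPTCHA_SITE_KEY=your_recaptcha_site_key
RECAPTCHA_SECRET_KEY=your_recaptcha_secret_key
LANGFUSE_PUBLIC_KEY=your_langfuse_public_key
LANGFUSE_SECRET_KEY=your_langfuse_secret_key
LANGFUSE_HOST=your_langfuse_host
```

If you run the stack with `docker-compose.yml`, most of these are passed into the `app` container as environment variables, so keep the two files consistent.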
59 | -------------------------------------------------------------------------------- /views/signup_view.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.user_manager import create_user, verify_user, create_google_user 3 | from helpers.validators import is_valid_email, is_valid_password 4 | from views.capcha_plugin import captcha_control 5 | from views.login_view import init_login_session 6 | from utils.google_authenticate import authenticator 7 | 8 | def signup_page(): 9 | st.title("🔐 Sign Up") 10 | 11 | new_email = st.text_input("Email") 12 | new_password = st.text_input("New Password", type='password') 13 | confirm_password = st.text_input("Confirm Password", type='password') 14 | 15 | # SignUp By Google 16 | authenticator.check_authentification() 17 | authorization_url = authenticator.get_authorization_url() 18 | 19 | if not st.session_state.get('connected', False): 20 | st.link_button('Sign Up With Google', authorization_url, use_container_width=True) 21 | else: 22 | email = st.session_state['user_info'].get('email') 23 | create_google_user(email) 24 | init_login_session(email) 25 | st.rerun() 26 | 27 | # Captcha Component 28 | captcha_control() 29 | 30 | if 'server_code' not in st.session_state: 31 | st.session_state.server_code = "" 32 | 33 | col1, col2, col3 = st.columns([1, 3, 1]) 34 | with col2: 35 | if st.button("Sign Up", use_container_width=True): 36 | if not is_valid_email(new_email): 37 | st.error("🚫 Invalid email format") 38 | elif not is_valid_password(new_password): 39 | st.error("🚫 Password must be at least 8 characters long, contain a letter, a number, and a special character") 40 | elif new_password != confirm_password: 41 | st.error("🚫 Passwords do not match") 42 | else: 43 | ret = create_user(new_email, new_password) 44 | if ret != "error": 45 | st.success("✅ User created successfully. Please check your email for the verification link.") 46 | else: 47 | st.error("🚫 Error creating user. Please try again.") 48 | 49 | if st.button("Go to Sign In", use_container_width=True): 50 | st.session_state['page'] = "login" 51 | st.rerun() 52 | 53 | st.markdown("
", unsafe_allow_html=True) 54 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | venv/ 13 | ENV/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | tests/__pycache__/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # Celery stuff 86 | celerybeat-schedule.* 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | .dmypy.json 106 | dmypy.json 107 | 108 | # Pyre type checker 109 | .pyre/ 110 | 111 | # pytype static type analyzer 112 | .pytype/ 113 | 114 | # Cython debug symbols 115 | cython_debug/ 116 | 117 | # Other artifacts 118 | *.swp 119 | *~ 120 | .DS_Store 121 | Thumbs.db 122 | 123 | # Docker-specific ignores 124 | docker-compose.local.yml 125 | # .dockerignore file itself to avoid inclusions 126 | .dockerignore 127 | # Others 128 | .git 129 | .gitignore 130 | .tmp 131 | .vscode/ 132 | .idea/ 133 | *.bak 134 | *.tmp -------------------------------------------------------------------------------- /database/chat_manager.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.connection import create_connection 3 | import bcrypt 4 | 5 | def create_chat_table(): 6 | conn = create_connection() 7 | if conn: 8 | try: 9 | cursor = conn.cursor() 10 | create_table_query = """ 11 | CREATE TABLE IF NOT EXISTS chat ( 12 | id SERIAL PRIMARY KEY, 13 | user_id INT NOT NULL, 14 | vector_id VARCHAR(255) NOT NULL, 15 | thread_id VARCHAR(255) NOT NULL, 16 | file_id VARCHAR(255) NOT NULL, 17 | assistant_id VARCHAR(255) NOT NULL 18 | ); 19 | """ 20 | cursor.execute(create_table_query) 21 | conn.commit() 22 | except Exception as e: 23 | print(e) 24 | finally: 25 | cursor.close() 26 | conn.close() 27 | 28 | def create_chat(user_id, vector_id, thread_id, file_id, assistant_id): 29 | conn = create_connection() 30 | if conn: 31 | try: 32 | cursor = conn.cursor() 33 | cursor.execute("INSERT INTO chat (user_id, vector_id, thread_id, file_id, assistant_id) VALUES (%s, %s, %s, %s, %s)", 34 | 
(user_id, vector_id, thread_id, file_id, assistant_id)) 35 | conn.commit() 36 | except Exception as e: 37 | print(e) 38 | finally: 39 | if 'cursor' in locals(): 40 | cursor.close() 41 | conn.close() 42 | 43 | def get_individual_chat(id): 44 | conn = create_connection() 45 | if conn: 46 | try: 47 | cursor = conn.cursor() 48 | cursor.execute("SELECT user_id, vector_id, thread_id, file_id, assistant_id FROM chat WHERE id = %s", (id,)) 49 | record = cursor.fetchone() 50 | if record: 51 | return record 52 | else: 53 | return None 54 | except Exception as e: 55 | print(e) 56 | return None 57 | finally: 58 | if 'cursor' in locals(): 59 | cursor.close() 60 | conn.close() 61 | 62 | def get_user_chats(user_id): 63 | conn = create_connection() 64 | if conn: 65 | try: 66 | cursor = conn.cursor() 67 | cursor.execute("SELECT * FROM chat WHERE user_id = %s", (user_id,)) 68 | record = cursor.fetchall() 69 | if record: 70 | return record 71 | else: 72 | return None 73 | except Exception as e: 74 | print(e) 75 | return None 76 | finally: 77 | if 'cursor' in locals(): 78 | cursor.close() 79 | conn.close() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | # client_secret.json 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | # .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /database/user_manager.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.connection import create_connection 3 | from utils.mailersend import signup_mailer 4 | import bcrypt 5 | import random 6 | 7 | def generate_6_digit_number(): 8 | return random.randint(100000, 999999) 9 | 10 | def create_users_table(): 11 | conn = create_connection() 12 | if conn: 13 | try: 14 | cursor = conn.cursor() 15 | create_table_query = """ 16 | CREATE TABLE IF NOT EXISTS users ( 17 | id SERIAL PRIMARY KEY, 18 | email VARCHAR(255) UNIQUE NOT NULL, 19 | password VARCHAR(255) NOT NULL, 20 | verify_id VARCHAR(255), 21 | status VARCHAR(20), 22 | country VARCHAR(30), 23 | verification_token VARCHAR(255), 24 | is_gmail INT 25 | ); 26 | """ 27 | cursor.execute(create_table_query) 28 | conn.commit() 29 | except Exception as e: 30 | print(e) 31 | finally: 32 | cursor.close() 33 | conn.close() 34 | else: 35 | st.error("Unable to connect to the database.") 36 | 37 | def create_user(email, password): 38 | print("create user!!!!!!") 39 | flag = 0 40 | conn = create_connection() 41 | if conn: 42 | try: 43 | cursor = conn.cursor() 44 | hashed_password = bcrypt.hashpw(password.encode('utf-8'), bcrypt.gensalt()).decode("utf-8") 45 | verify_id = generate_6_digit_number() 46 | status = "pending" 47 | cursor.execute( 48 | "INSERT INTO users (email, password, verification_token, status, is_gmail) VALUES (%s, %s, %s, %s, %s)", 49 | (email, hashed_password, verify_id, status, 0) 50 | ) 51 | print("signup_mailer called!!!!!!") 52 | signup_mailer(email, verify_id) 53 | conn.commit() 54 | except Exception as e: 55 | print(e) 56 | flag = 1 57 | finally: 58 | if 'cursor' in locals(): 59 | cursor.close() 60 
| conn.close() 61 | if flag == 0: 62 | return verify_id 63 | else: 64 | return "error" 65 | 66 | def create_google_user(email): 67 | conn = create_connection() 68 | if conn: 69 | try: 70 | cursor = conn.cursor() 71 | cursor.execute("SELECT * FROM users WHERE email = %s", (email,)) 72 | result = cursor.fetchone() 73 | if result is None: 74 | status = "verified" 75 | cursor.execute( 76 | "INSERT INTO users (email, password, status, is_gmail) VALUES (%s, %s, %s, %s)", 77 | (email, 'XXX', status, 1) 78 | ) 79 | conn.commit() 80 | else: 81 | print(email) 82 | except Exception as e: 83 | print(e) 84 | pass 85 | finally: 86 | if 'cursor' in locals(): 87 | cursor.close() 88 | conn.close() 89 | 90 | def verify_user(email): 91 | conn = create_connection() 92 | if conn: 93 | try: 94 | cursor = conn.cursor() 95 | cursor.execute( 96 | "UPDATE users SET status = %s WHERE email = %s", 97 | ('verified', email) 98 | ) 99 | conn.commit() 100 | except Exception as e: 101 | print(e) 102 | finally: 103 | if 'cursor' in locals(): 104 | cursor.close() 105 | conn.close() 106 | 107 | def authenticate_user(email, password): 108 | conn = create_connection() 109 | if conn: 110 | try: 111 | cursor = conn.cursor() 112 | query = "SELECT password, status FROM users WHERE email = %s AND is_gmail = %s" 113 | cursor.execute(query, (email, 0)) 114 | record = cursor.fetchone() 115 | 116 | if record: 117 | if bcrypt.checkpw(password.encode('utf-8'), record[0].encode('utf-8')): 118 | if record[1] == "verified": 119 | return "Success" 120 | else: 121 | return "Email has not been verified" 122 | else: 123 | return "Invalid email or password" 124 | else: 125 | return "Invalid email or password" 126 | except Exception as e: 127 | print(e) 128 | return False 129 | finally: 130 | if 'cursor' in locals(): 131 | cursor.close() 132 | conn.close() 133 | else: 134 | print(e) 135 | return False 136 | 137 | def get_user_id(email): 138 | conn = create_connection() 139 | if conn: 140 | try: 141 | cursor = conn.cursor() 142 | cursor.execute("SELECT id FROM users WHERE email = %s", (email,)) 143 | record = cursor.fetchone() 144 | if record: 145 | return record[0] 146 | else: 147 | return None 148 | except Exception as e: 149 | print(e) 150 | return None 151 | finally: 152 | if 'cursor' in locals(): 153 | cursor.close() 154 | conn.close() 155 | 156 | def update_user_country(email, country): 157 | conn = create_connection() 158 | if conn: 159 | try: 160 | cursor = conn.cursor() 161 | cursor.execute( 162 | "UPDATE users SET country = %s WHERE email = %s", 163 | (country, email) 164 | ) 165 | conn.commit() 166 | except Exception as e: 167 | print(e) 168 | finally: 169 | if 'cursor' in locals(): 170 | cursor.close() 171 | conn.close() -------------------------------------------------------------------------------- /embeddings/vector_store.py: -------------------------------------------------------------------------------- 1 | from config import settings 2 | from langfuse import Langfuse 3 | from langfuse.openai import openai 4 | from langfuse.decorators import langfuse_context, observe 5 | import hashlib 6 | import secrets 7 | import io 8 | import time 9 | from database.chat_manager import create_chat, get_user_chats 10 | from database.cost_manager import insert_cost 11 | 12 | openai_client = openai.OpenAI(api_key=settings.OPENAI_API_KEY) 13 | # Initialize Langfuse Client 14 | langfuse_client = Langfuse( 15 | secret_key=settings.LANGFUSE_SECRET_KEY, 16 | public_key=settings.LANGFUSE_PUBLIC_KEY, 17 | host=settings.LANGFUSE_HOST 18 | ) 19 | 20 | 
@observe() 21 | def generate_vector_store(uploaded_files, user_id): 22 | # Generate Vector Name Using Hash 23 | random_bytes = secrets.token_bytes(64) 24 | hash_object = hashlib.sha256() 25 | hash_object.update(random_bytes) 26 | vector_name = hash_object.hexdigest() 27 | 28 | # Init File for upload OpenAI 29 | file_content = uploaded_files[0].getvalue() 30 | file_bytes_io = io.BytesIO(file_content.encode('utf-8') if isinstance(file_content, str) else file_content) 31 | file_bytes_io.name = uploaded_files[0].name 32 | 33 | # Upload File on OpenAI store 34 | file = openai_client.files.create( 35 | file=file_bytes_io, 36 | purpose='assistants' 37 | ) 38 | uploaded_files[0] = file.id 39 | 40 | # Vector Store 41 | vector = openai_client.beta.vector_stores.create( 42 | name=vector_name 43 | ) 44 | openai_client.beta.vector_stores.files.create( 45 | vector_store_id=vector.id, 46 | file_id=file.id 47 | ) 48 | 49 | # chat assistant 50 | assistant = openai_client.beta.assistants.create( 51 | instructions="Use the file provided as your knowledge base to best respond to customer queries. Only include at least on file citation(for example: 【4:1†source】) in the answer.", 52 | model="gpt-4o-mini", 53 | tools=[ 54 | { 55 | "type": "file_search", 56 | } 57 | ], 58 | tool_resources={ 59 | "file_search": { 60 | "vector_store_ids":[vector.id] 61 | } 62 | } 63 | ) 64 | 65 | # chat thread 66 | thread = openai_client.beta.threads.create() 67 | create_chat(user_id, vector.id, thread.id, file.id, assistant.id) 68 | print(f"Thread: {thread.id}") 69 | print(f"Assistant: {assistant.id}") 70 | return thread.id, assistant.id, file_bytes_io.name 71 | 72 | @observe() 73 | def get_conversational_chain(user_question, thread_id, assistant_id, file_name, session_id): 74 | """Ignore thread_id parameter due to cost limit""" 75 | print(f"Thread: {thread_id}") 76 | print(f"Assistant: {assistant_id}") 77 | print(f"User Question: {user_question}") 78 | if user_question == "": 79 | return 80 | thread = openai_client.beta.threads.create() 81 | openai_client.beta.threads.messages.create( 82 | thread_id=thread.id, 83 | role="user", 84 | content=user_question + """Include references after the answer in this format: 85 |
86 | File Name with extension Here
87 | > Exact contents of the references(10 sentences)
88 |
89 | 90 | Never produce unnecessary statements like "For more details, here is the citation formatted as requested:" 91 | Only include at least on file citation(for example: 【4:1†source】) in the answer and do not include file citations(for example: 【4:1†source】) in the references. 92 | Generate answer at any cost. 93 | """ 94 | ) 95 | with openai_client.beta.threads.runs.stream( 96 | thread_id=thread.id, 97 | assistant_id=assistant_id, 98 | instructions=f"You will act as a helpful assistant. Please analyze {file_name} and provide accurate answers to user question. Only include at least on file citation(for example: 【4:1†source】) in the answer.", 99 | ) as stream: 100 | for event in stream: 101 | 102 | if event.event == 'thread.run.step.created': 103 | print('\nMessage creation detected...') 104 | 105 | for text in stream.text_deltas: 106 | yield text 107 | elif event.event == 'thread.message.delta': 108 | yield event.data.delta.content[0].text.value 109 | 110 | if stream._current_message_content: 111 | file_citation_annotations = stream._current_message_content.text.annotations 112 | yield file_citation_annotations 113 | 114 | # log internal generation within the openai assistant as a separate child generation to langfuse 115 | if stream.current_run: 116 | # Extract and print cost data 117 | cost_data = cost_for_tokens(stream.current_run.usage) 118 | print(f"Cost for this response: {cost_data}") 119 | insert_cost(session_id, cost_data) 120 | 121 | @observe() 122 | def generate_questions(thread_id, assistant_id, file_name, session_id): 123 | thread = openai_client.beta.threads.create() 124 | openai_client.beta.threads.messages.create( 125 | thread_id=thread.id, 126 | role="user", 127 | content=f"Generate 3 questions for the {file_name}. Only output 3 questions and never output statement." 128 | ) 129 | 130 | run = openai_client.beta.threads.runs.create( 131 | thread_id=thread.id, 132 | assistant_id=assistant_id, 133 | instructions=f"You will act as a helpful assistant. Please analyze {file_name} and provide accurate answers to user question." 134 | ) 135 | 136 | retrieved_run = openai_client.beta.threads.runs.retrieve( 137 | thread_id=thread.id, 138 | run_id=run.id 139 | ) 140 | 141 | counter = 0 142 | while retrieved_run.status != "completed": 143 | retrieved_run = openai_client.beta.threads.runs.retrieve( 144 | thread_id=thread.id, 145 | run_id=run.id 146 | ) 147 | counter += 1 148 | if counter % 10 == 0: 149 | time.sleep(1) 150 | 151 | thread_messages = openai_client.beta.threads.messages.list(thread.id) 152 | 153 | # log the generation for tracking cost 154 | if retrieved_run: 155 | langfuse_client.generation( 156 | trace_id=langfuse_context.get_current_trace_id(), 157 | parent_observation_id=langfuse_context.get_current_observation_id(), 158 | model=retrieved_run.model, 159 | usage=retrieved_run.usage, 160 | input=f"generate 3 questions for the {file_name}. 
only output 3 questions", 161 | output=[msg.content[0].text.value for msg in thread_messages.data if msg.role == 'assistant'] 162 | ) 163 | # Extract and print cost data 164 | cost_data = cost_for_tokens(retrieved_run.usage) 165 | print(f"Cost for this response: {cost_data}") 166 | insert_cost(session_id, cost_data) 167 | return thread_messages.data[0].content[0].text.value 168 | 169 | def clear_cache(user_id): 170 | records = get_user_chats(user_id) 171 | 172 | # Check if records is None and set it to an empty list if it is 173 | if records is None: 174 | records = [] 175 | 176 | for record in records: 177 | vector_id = record[2] 178 | file_id = record[4] 179 | print(vector_id) 180 | print(file_id) 181 | try: 182 | # Ensure you pass the parameters correctly as dicts 183 | openai_client.beta.vector_stores.delete(vector_store_id=vector_id) 184 | openai_client.files.delete(file_id=file_id) 185 | except Exception as e: 186 | print(f"Error while deleting resources: {e}") 187 | 188 | return 189 | 190 | def cost_for_tokens(usage, model = "GPT-4o mini"): 191 | return usage.prompt_tokens * 0.15 / 1000000 + usage.completion_tokens * 0.6 / 1000000 192 | 193 | def get_file_content(file_id): 194 | # Retrieve file content 195 | try: 196 | return openai_client.files.retrieve(file_id=file_id) 197 | except Exception as e: 198 | print(f"Failed to retrieve file metadata: {e}") 199 | return None 200 | -------------------------------------------------------------------------------- /views/main_view.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from embeddings.vector_store import generate_questions, generate_vector_store, get_conversational_chain, clear_cache, get_file_content 3 | from database.session_manager import update_end_session 4 | from functools import partial 5 | from streamlit_url_fragment import get_fragment 6 | import fitz 7 | import time 8 | import re 9 | 10 | from utils.string_util import find_positions_multiple 11 | 12 | file_citations = [] 13 | 14 | def clear_chat_history(): 15 | st.session_state.answered_questions = [] 16 | st.session_state.messages = [ 17 | {"role": "assistant", "content": "Upload some PDFs and ask me a question"}] 18 | 19 | async def user_input(user_question, thread_id, assistant_id, file_name, session_id): 20 | response = get_conversational_chain(user_question, thread_id, assistant_id, file_name, session_id) 21 | for chunk in response: 22 | yield chunk 23 | 24 | def display_question(thread_id, assistant_id, file_name, session_id): 25 | questions = generate_questions(thread_id, assistant_id, file_name, session_id) 26 | return [q.strip() for q in questions.split('\n') if q.strip()] 27 | 28 | def process_file(): 29 | progress_bar = st.progress(0) 30 | for i in range(10): 31 | time.sleep(0.2) 32 | progress_bar.progress((i + 1) * 10) 33 | 34 | def start_new_job(): 35 | st.session_state.questions = [] 36 | st.session_state.uploaded_files = [] 37 | st.session_state.file_uploader_key += 1 38 | st.session_state.clicked_file_id = None 39 | st.session_state.citation_index = None 40 | st.session_state.initial_state = True 41 | clear_chat_history() 42 | clear_cache(st.session_state["user_id"]) 43 | st.rerun() 44 | 45 | def set_file_id(file_id, index): 46 | if st.session_state.clicked_file_id != file_id or st.session_state.citation_index != index: 47 | if st.session_state.clicked_file_id is not None: 48 | st.session_state.initial_state = False 49 | st.session_state.clicked_file_id = file_id 50 | 
st.session_state.citation_index = index 51 | st.rerun() 52 | 53 | async def display_chat_room(): 54 | global file_citations 55 | 56 | st.title("Chat with document files 🤖") 57 | st.write("Welcome to the chat!") 58 | st.write("Upload any file types or several types(pdf, docx, csv, pptx, xlsx, txt)") 59 | 60 | st.sidebar.button('Clear Chat History', on_click=clear_chat_history, use_container_width=True) 61 | st.sidebar.button('SignOut', on_click=logout) 62 | 63 | current_file_id = get_fragment() 64 | if current_file_id is not None: 65 | conf = current_file_id.split("#") 66 | if len(conf) > 2: 67 | set_file_id(conf[1], conf[2]) 68 | else: 69 | set_file_id(None, None) 70 | # Chat input 71 | if "messages" not in st.session_state.keys(): 72 | st.session_state.messages = [ 73 | {"role": "assistant", "content": "Upload some PDFs and ask me a question"}] 74 | 75 | for message in st.session_state.messages: 76 | with st.chat_message(message["role"]): 77 | # st.code(message["content"], language="markdown", wrap_lines=True) 78 | st.markdown(f"""{message["content"]}""", unsafe_allow_html=True) 79 | 80 | button_pressed = '' 81 | if 'questions' in st.session_state and st.session_state.questions: 82 | for question in st.session_state.questions: 83 | if st.button(question, disabled=question in st.session_state.answered_questions): 84 | button_pressed = question[3:] 85 | if question not in st.session_state.answered_questions: 86 | st.session_state.answered_questions.append(question) 87 | 88 | if prompt := ((st.chat_input()) or button_pressed): 89 | st.session_state.messages.append({"role": "user", "content": prompt}) 90 | with st.chat_message("user"): 91 | st.code(prompt, language="markdown", wrap_lines=True) 92 | 93 | st.markdown(""" 94 | 99 | """, unsafe_allow_html=True) 100 | 101 | if st.session_state.messages[-1]["role"] != "assistant" and 'thread_id' in st.session_state: 102 | with st.chat_message("assistant"): 103 | with st.spinner("Thinking..."): 104 | response = "" 105 | placeholder = st.empty() 106 | async for chunk in user_input(prompt, st.session_state.thread_id, st.session_state.assistant_id, st.session_state['file_name'], st.session_state.session_id): 107 | if isinstance(chunk, str): 108 | response = response + chunk 109 | else: 110 | positions = find_positions_multiple(response, "【", "】") 111 | 112 | if len(positions) > 0: 113 | placeholder.markdown("") 114 | 115 | end_pos = 0 116 | result = "" 117 | for index, (start, end) in enumerate(positions): 118 | result += response[end_pos:start] 119 | end_pos = end 120 | 121 | html_link = f"{response[start:end]}" 122 | result += html_link 123 | 124 | pattern = r'
\s*(.*?)\s*
' 125 | matches = re.findall(pattern, response[end:], re.DOTALL) 126 | 127 | if len(matches) > 0: 128 | file_citations.append({ 129 | "index": chunk[index].start_index, 130 | "markdown": matches[0] 131 | }) 132 | 133 | result += response[end_pos:] 134 | response = result 135 | 136 | placeholder.markdown(response, unsafe_allow_html=True) 137 | 138 | if response: 139 | st.session_state.messages.append({"role": "assistant", "content": response}) 140 | st.rerun() 141 | 142 | if len(st.session_state.answered_questions) is 3: 143 | st.session_state.btn_disabed = False 144 | st.text("Would you like to continue?") 145 | if st.button("Yes", disabled=st.session_state.btn_disabled): 146 | st.session_state.answered_questions = [] 147 | st.session_state.questions = display_question(st.session_state['thread_id'], st.session_state['assistant_id'], st.session_state['file_name'], st.session_state.session_id) 148 | st.rerun() 149 | if st.button("No", disabled=st.session_state.btn_disabled): 150 | st.session_state.btn_disabled = True 151 | st.rerun() 152 | 153 | async def main_content(): 154 | if "file_uploader_key" not in st.session_state: 155 | st.session_state["file_uploader_key"] = 0 156 | if "uploaded_files" not in st.session_state: 157 | st.session_state["uploaded_files"] = [] 158 | if 'clicked_file_id' not in st.session_state: 159 | st.session_state.clicked_file_id = None 160 | if 'citation_index' not in st.session_state: 161 | st.session_state.citation_index = None 162 | if 'initial_state' not in st.session_state: 163 | st.session_state.initial_state = False 164 | if 'answered_questions' not in st.session_state: 165 | st.session_state.answered_questions = [] 166 | if 'btn_disabled' not in st.session_state: 167 | st.session_state.btn_disabled = False 168 | # Sidebar for uploading PDF files 169 | with st.sidebar: 170 | st.title("Menu:") 171 | if st.button("Start a New Job", use_container_width=True): 172 | start_new_job() 173 | uploaded_files = st.file_uploader( 174 | "Upload your Files and Click on the Submit & Process Button", accept_multiple_files=True, key=st.session_state["file_uploader_key"]) 175 | if uploaded_files: 176 | st.session_state["uploaded_files"] = uploaded_files 177 | if uploaded_files != []: 178 | if st.button("Submit & Process", use_container_width=True): 179 | with st.spinner("Processing..."): 180 | start_time = time.time() 181 | thread_id, assistant_id, file_name = generate_vector_store(uploaded_files, user_id=st.session_state.user_id) 182 | st.session_state['thread_id'] = thread_id 183 | st.session_state['assistant_id'] = assistant_id 184 | st.session_state['file_name'] = file_name 185 | st.session_state.questions = display_question(thread_id, assistant_id, st.session_state['file_name'], st.session_state.session_id) 186 | process_file() 187 | end_time = time.time() 188 | elapsed_time = end_time - start_time 189 | st.success(f"Done in {elapsed_time:.2f} seconds") 190 | 191 | if st.session_state.clicked_file_id and not st.session_state.initial_state: 192 | col1, col2 = st.columns(2) 193 | with col1: 194 | for citation in file_citations: 195 | if citation["index"] == int(st.session_state.citation_index): 196 | st.markdown(citation["markdown"], unsafe_allow_html=True) 197 | with col2: 198 | await display_chat_room() 199 | else: 200 | await display_chat_room() 201 | 202 | def logout(): 203 | update_end_session(st.session_state["session_id"]) 204 | clear_cache(st.session_state["user_id"]) 205 | st.session_state.clear() 206 | st.session_state['logout'] = True 207 | 
st.session_state['name'] = None 208 | st.session_state['username'] = None 209 | st.session_state['connected'] = False --------------------------------------------------------------------------------