├── nginx ├── Dockerfile └── default.conf ├── run.sh ├── utils ├── google_authenticate.py ├── string_util.py ├── ip_tools.py └── mailersend.py ├── requirements.txt ├── .vscode └── settings.json ├── database ├── connection.py ├── cost_manager.py ├── session_manager.py ├── chat_manager.py └── user_manager.py ├── helpers └── validators.py ├── test.py ├── Dockerfile ├── docker-compose.local.yml ├── config └── settings.py ├── app.py ├── docker-compose.yml ├── views ├── capcha_plugin.py ├── login_view.py ├── signup_view.py └── main_view.py ├── main.py ├── README.md ├── .dockerignore ├── .gitignore └── embeddings └── vector_store.py /nginx/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx:latest 2 | 3 | COPY default.conf /etc/nginx/conf.d -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start Streamlit in the background 4 | streamlit run app.py & 5 | 6 | # Start your Python script 7 | python main.py -------------------------------------------------------------------------------- /utils/google_authenticate.py: -------------------------------------------------------------------------------- 1 | from streamlit_google_auth import Authenticate 2 | 3 | authenticator = Authenticate( 4 | secret_credentials_path='./client_secret.json', 5 | cookie_name='rag-system-biscoito', 6 | cookie_key='senha_maluca_12345', 7 | redirect_uri='https://gtrag.bot/', 8 | ) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | google-generativeai 3 | python-dotenv 4 | langchain 5 | PyPDF2 6 | chromadb 7 | faiss-cpu 8 | langchain_google_genai 9 | langchain-community 10 | mysql-connector-python 11 | bcrypt 12 | ratelimit 13 | openai 14 | langfuse 15 | captcha 16 | mailersend 17 | pyodbc 18 | streamlit_google_auth 19 | streamlit-extras 20 | psycopg2-binary 21 | pymupdf 22 | streamlit-url-fragment -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "sqltools.connections": [ 3 | { 4 | "previewLimit": 50, 5 | "server": "localhost", 6 | "port": 5432, 7 | "driver": "PostgreSQL", 8 | "name": "oknoke", 9 | "group": "oknoke", 10 | "database": "oknoke", 11 | "username": "oknoke", 12 | "password": "oknoke" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /database/connection.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from config import settings 3 | 4 | def create_connection(): 5 | try: 6 | conn = psycopg2.connect( 7 | host=settings.DB_HOST, 8 | database=settings.DB_NAME, 9 | user=settings.DB_USER, 10 | password=settings.DB_PASSWORD 11 | ) 12 | return conn 13 | except Exception as e: 14 | print(e) 15 | return None -------------------------------------------------------------------------------- /helpers/validators.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def is_valid_email(email): 4 | email_regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' 5 | return re.match(email_regex, email) is not None 6 | 7 | def is_valid_password(password): 8 | if len(password) < 8: 9 | 
return False 10 | if not re.search(r"[A-Za-z]", password): 11 | return False 12 | if not re.search(r"[0-9]", password): 13 | return False 14 | if not re.search(r"[!@#$%^&*(),.?\":{}|<>]", password): 15 | return False 16 | return True -------------------------------------------------------------------------------- /utils/string_util.py: -------------------------------------------------------------------------------- 1 | 2 | def find_positions_multiple(text, start_substring, end_substring): 3 | positions = [] 4 | start_idx = 0 5 | while True: 6 | start = text.find(start_substring, start_idx) 7 | if start == -1: 8 | break 9 | end = text.find(end_substring, start + len(start_substring)) 10 | if end == -1: 11 | break 12 | positions.append((start, end + len(end_substring))) # Adjust end position to include end_substring 13 | start_idx = end + len(end_substring) 14 | return positions -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import openai 2 | from config import settings 3 | openai_client = openai.OpenAI(api_key=settings.OPENAI_API_KEY) 4 | # List all files 5 | files = openai_client.files.list() 6 | print(files) 7 | # Delete each file 8 | for file in files.data: 9 | file_id = file.id 10 | openai_client.files.delete(file_id) 11 | print(f"Deleted file: {file_id}") 12 | 13 | 14 | vectors = openai_client.beta.vector_stores.list() 15 | print(vectors) 16 | 17 | for vector in vectors: 18 | vector_id = vector.id 19 | openai_client.beta.vector_stores.delete( 20 | vector_store_id=vector_id 21 | ) 22 | print(f"Deleted vector: {vector_id}") 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /usr/src/app 6 | 7 | # Copy the requirements file into the container 8 | COPY requirements.txt ./ 9 | 10 | # Install any dependencies specified in requirements.txt 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | # Copy the rest of the application code into the container 14 | COPY . . 
15 | 16 | # Make port 80 available to the world outside this container 17 | # (Optional, only if your application runs on a specific port) 18 | # EXPOSE 80 19 | 20 | # Define environment variable 21 | # ENV PYTHONUNBUFFERED=1 22 | 23 | # Make the run script executable 24 | RUN chmod +x run.sh 25 | 26 | # Command to run the script 27 | CMD ["./run.sh"] -------------------------------------------------------------------------------- /docker-compose.local.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | postgres: 5 | image: postgres:13 6 | container_name: postgres 7 | environment: 8 | POSTGRES_DB: oknoke 9 | POSTGRES_USER: oknoke 10 | POSTGRES_PASSWORD: oknoke 11 | ports: 12 | - "5432:5432" # Expose PostgreSQL port 13 | volumes: 14 | - postgres_data:/var/lib/postgresql/data 15 | 16 | # nginx: 17 | # container_name: nginx 18 | # restart: always 19 | # build: 20 | # context: ./nginx 21 | # dockerfile: Dockerfile 22 | # volumes: 23 | # - /etc/letsencrypt/live/gtrag.bot/fullchain.pem:/etc/letsencrypt/live/gtrag.bot/fullchain.pem 24 | # - /etc/letsencrypt/live/gtrag.bot/privkey.pem:/etc/letsencrypt/live/gtrag.bot/privkey.pem 25 | # ports: 26 | # - "80:80" 27 | # - "443:443" 28 | # depends_on: 29 | # - postgres 30 | 31 | volumes: 32 | postgres_data: {} -------------------------------------------------------------------------------- /config/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | load_dotenv() 5 | 6 | # LLM Model Information 7 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 8 | 9 | # MySQL Database information 10 | DB_HOST = os.getenv("DB_HOST") 11 | DB_USER = os.getenv("DB_USER") 12 | DB_PASSWORD = os.getenv("DB_PASSWORD") 13 | DB_NAME = os.getenv("DB_NAME") 14 | 15 | # Mailersend Information 16 | MAILERSEND_API_KEY = os.getenv("MAILERSEND_API_KEY") 17 | EMAIL_TEMPLATE_SIGNUP = os.getenv("EMAIL_TEMPLATE_SIGNUP") 18 | 19 | # URL Information 20 | BACKEND_URL = os.getenv("BACKEND_URL") 21 | PRODUCT_URL = os.getenv("PRODUCT_URL") 22 | 23 | # reCAPTCHA Information 24 | RECAPTCHA_SITE_KEY = os.getenv("RECAPTCHA_SITE_KEY") 25 | RECAPTCHA_SECRET_KEY = os.getenv("RECAPTCHA_SECRET_KEY") 26 | 27 | # langfuse Information 28 | LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY") 29 | LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY") 30 | LANGFUSE_HOST = os.getenv("LANGFUSE_HOST") -------------------------------------------------------------------------------- /utils/ip_tools.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from streamlit import runtime 3 | from streamlit.runtime.scriptrunner import get_script_run_ctx 4 | 5 | def get_remote_ip() -> str: 6 | """Get remote ip.""" 7 | try: 8 | ctx = get_script_run_ctx() 9 | if ctx is None: 10 | return None 11 | 12 | session_info = runtime.get_instance().get_client(ctx.session_id) 13 | if session_info is None: 14 | return None 15 | except Exception as e: 16 | return None 17 | 18 | return session_info.request.remote_ip 19 | 20 | def get_country_name(ip_address: str) -> str: 21 | """Get country name from IP address using ipapi service.""" 22 | try: 23 | response = requests.get(f"http://ip-api.com/json/{ip_address}") 24 | data = response.json() 25 | return data.get("country", "Unknown") 26 | except Exception as e: 27 | return "Unknown" 28 | 29 | def get_remote_country() -> str: 30 | """Get remote country""" 31 | 
ip_address = get_remote_ip() 32 | country = get_country_name(ip_address) 33 | return country -------------------------------------------------------------------------------- /database/cost_manager.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.connection import create_connection 3 | 4 | def create_cost_table(): 5 | conn = create_connection() 6 | if conn: 7 | try: 8 | cursor = conn.cursor() 9 | create_table_query = """ 10 | CREATE TABLE IF NOT EXISTS cost ( 11 | id SERIAL PRIMARY KEY, 12 | session_id INT NOT NULL, 13 | cost FLOAT NOT NULL 14 | ); 15 | """ 16 | cursor.execute(create_table_query) 17 | conn.commit() 18 | except Exception as e: 19 | print(e) 20 | finally: 21 | cursor.close() 22 | conn.close() 23 | 24 | def insert_cost(session_id, cost): 25 | query = "INSERT INTO cost (session_id, cost) VALUES (%s, %s)" 26 | conn = create_connection() 27 | if conn: 28 | try: 29 | cursor = conn.cursor() 30 | cursor.execute(query, (session_id, cost)) 31 | conn.commit() 32 | except Exception as e: 33 | print(f"Error: {e}") 34 | finally: 35 | if 'cursor' in locals(): 36 | cursor.close() 37 | conn.close() -------------------------------------------------------------------------------- /utils/mailersend.py: -------------------------------------------------------------------------------- 1 | from mailersend import emails 2 | from config import settings 3 | 4 | api_key = settings.MAILERSEND_API_KEY 5 | signup_template = settings.EMAIL_TEMPLATE_SIGNUP 6 | backend_url = settings.BACKEND_URL 7 | 8 | mailer = emails.NewEmail(api_key) 9 | 10 | def signup_mailer(customer_email, verify_token): 11 | signup_link = f"{backend_url}/verify-email?token={verify_token}" 12 | print(signup_link) 13 | print(api_key) 14 | print(signup_template) 15 | print(backend_url) 16 | mail_body = {"signup_link": signup_link} 17 | mail_from = { 18 | "name": "GTRAG", 19 | "email": "info@gtrag.com", 20 | } 21 | recipients = [ 22 | { 23 | "email": customer_email, 24 | } 25 | ] 26 | personalization = [ 27 | { 28 | "email": customer_email, 29 | "data": { 30 | "verify_id": signup_link 31 | } 32 | } 33 | ] 34 | mailer.set_mail_from(mail_from, mail_body) 35 | mailer.set_mail_to(recipients, mail_body) 36 | mailer.set_subject("Please verify your email", mail_body) 37 | mailer.set_template(signup_template, mail_body) 38 | mailer.set_personalization(personalization, mail_body) 39 | response = mailer.send(mail_body) 40 | print(response) 41 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | st.set_page_config(layout="wide") 3 | 4 | import asyncio 5 | from views import main_view, login_view, signup_view 6 | from database.user_manager import create_users_table 7 | from database.chat_manager import create_chat_table 8 | from database.cost_manager import create_cost_table 9 | from database.session_manager import create_session_table 10 | 11 | # st.set_page_config( 12 | # page_title="Chatbot", 13 | # page_icon="🤖" 14 | # ) 15 | async def main(): 16 | """Main function to execute the Streamlit app.""" 17 | if 'connected' not in st.session_state: 18 | st.session_state['connected'] = False 19 | 20 | if "logged_in" not in st.session_state: 21 | st.session_state["logged_in"] = False 22 | st.session_state['page'] = 'login' 23 | 24 | if "user_id" in st.session_state and st.session_state["user_id"]: 25 | 
st.session_state["logged_in"] = True 26 | 27 | if 'user_info' not in st.session_state: 28 | st.session_state['user_info'] = {} 29 | 30 | if st.session_state['logged_in']: 31 | await main_view.main_content() 32 | else: 33 | if st.session_state['page'] == 'login': 34 | login_view.login_page() 35 | else: 36 | signup_view.signup_page() 37 | 38 | if __name__ == "__main__": 39 | # Call the function to create the table 40 | create_users_table() 41 | create_chat_table() 42 | create_cost_table() 43 | create_session_table() 44 | # Run the main function 45 | asyncio.run(main()) -------------------------------------------------------------------------------- /nginx/default.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 80; 3 | server_name gtrag.bot; 4 | 5 | proxy_read_timeout 600; 6 | proxy_connect_timeout 300; 7 | 8 | ssl_certificate /etc/letsencrypt/live/gtrag.bot/fullchain.pem; 9 | ssl_certificate_key /etc/letsencrypt/live/gtrag.bot/privkey.pem; 10 | 11 | location / { 12 | proxy_pass http://app:8000; 13 | 14 | proxy_http_version 1.1; 15 | proxy_set_header Upgrade $http_upgrade; 16 | proxy_set_header Connection "upgrade"; 17 | proxy_set_header Host $host; 18 | proxy_set_header X-Real-IP $remote_addr; 19 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 20 | proxy_set_header X-Forwarded-Proto $scheme; 21 | } 22 | } 23 | 24 | server { 25 | listen 443 ssl; 26 | server_name gtrag.bot; 27 | 28 | proxy_read_timeout 600; 29 | proxy_connect_timeout 300; 30 | 31 | client_max_body_size 200M; 32 | 33 | ssl_certificate /etc/letsencrypt/live/gtrag.bot/fullchain.pem; 34 | ssl_certificate_key /etc/letsencrypt/live/gtrag.bot/privkey.pem; 35 | 36 | location / { 37 | proxy_pass http://app:8501; 38 | 39 | proxy_http_version 1.1; 40 | proxy_set_header Upgrade $http_upgrade; 41 | proxy_set_header Connection "upgrade"; 42 | proxy_set_header Host $host; 43 | proxy_set_header X-Real-IP $remote_addr; 44 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 45 | proxy_set_header X-Forwarded-Proto $scheme; 46 | } 47 | } -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | app: 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile 8 | container_name: Oknoke 9 | environment: 10 | - OPENAI_API_KEY=${OPENAI_API_KEY} 11 | - DB_HOST=postgres 12 | - DB_USER=oknoke 13 | - DB_PASSWORD=oknoke 14 | - DB_NAME=oknoke 15 | - MAILERSEND_API_KEY=${MAILERSEND_API_KEY} 16 | - EMAIL_TEMPLATE_SIGNUP=${EMAIL_TEMPLATE_SIGNUP} 17 | - BACKEND_URL=${BACKEND_URL} 18 | - PRODUCT_URL=${PRODUCT_URL} 19 | - RECAPTCHA_SITE_KEY='' 20 | - RECAPTCHA_SECRET_KEY='' 21 | - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY} 22 | - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY} 23 | - LANGFUSE_HOST=${LANGFUSE_HOST} 24 | volumes: 25 | - .:/usr/src/app 26 | ports: 27 | - "5000:5000" 28 | entrypoint: ["./run.sh"] 29 | 30 | postgres: 31 | image: postgres:13 32 | container_name: postgres 33 | environment: 34 | POSTGRES_DB: oknoke 35 | POSTGRES_USER: oknoke 36 | POSTGRES_PASSWORD: oknoke 37 | ports: 38 | - "5432:5432" # Expose PostgreSQL port 39 | volumes: 40 | - postgres_data:/var/lib/postgresql/data 41 | 42 | nginx: 43 | container_name: nginx 44 | restart: always 45 | build: 46 | context: ./nginx 47 | dockerfile: Dockerfile 48 | volumes: 49 | - /etc/letsencrypt/live/gtrag.bot/fullchain.pem:/etc/letsencrypt/live/gtrag.bot/fullchain.pem 50 | - /etc/letsencrypt/live/gtrag.bot/privkey.pem:/etc/letsencrypt/live/gtrag.bot/privkey.pem 51 | ports: 52 | - "80:80" 53 | - "443:443" 54 | depends_on: 55 | - app 56 | - postgres 57 | 58 | volumes: 59 | postgres_data: {} -------------------------------------------------------------------------------- /views/capcha_plugin.py: -------------------------------------------------------------------------------- 1 | # import library 2 | import streamlit as st 3 | from captcha.image import ImageCaptcha 4 | import random, string 5 | 6 | 7 | # define the costant 8 | length_captcha = 4 9 | width = 220 10 | height = 100 11 | 12 | # define the function for the captcha control 13 | def captcha_control(): 14 | if 'controllo' not in st.session_state or st.session_state['controllo'] == False: 15 | st.session_state['controllo'] = False 16 | 17 | # Set up the captcha text 18 | if 'Captcha' not in st.session_state: 19 | st.session_state['Captcha'] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=4)) 20 | 21 | # Create columns for the captcha image, input, and verify button 22 | col1, col2, col3 = st.columns([2, 4, 1]) 23 | 24 | with col1: 25 | image = ImageCaptcha(width=width, height=height) 26 | data = image.generate(st.session_state['Captcha']) 27 | st.image(data) 28 | 29 | with col2: 30 | capta2_text = st.text_input('Enter captcha text', placeholder='Type here...') 31 | 32 | with col3: 33 | st.text('') 34 | if st.button("Verify", key="verify_button", help="Click to verify the captcha"): 35 | # if st.button("Verify", key="verify_button"): 36 | if st.session_state['Captcha'].lower() == capta2_text.lower().strip(): 37 | del st.session_state['Captcha'] 38 | st.session_state['controllo'] = True 39 | st.rerun() 40 | else: 41 | st.error("❌ Incorrect captcha. 
Please try again.") 42 | del st.session_state['Captcha'] 43 | st.rerun() 44 | else: 45 | st.stop() -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from fastapi.responses import HTMLResponse 2 | from fastapi import FastAPI, HTTPException, Request 3 | from pydantic import BaseModel 4 | from database.connection import create_connection 5 | from utils.mailersend import signup_mailer 6 | import bcrypt 7 | import random 8 | import string 9 | import uvicorn 10 | import os 11 | from dotenv import load_dotenv 12 | 13 | load_dotenv() 14 | PRODUCT_URL = os.getenv("PRODUCT_URL") 15 | 16 | app = FastAPI() 17 | 18 | 19 | @app.get("/api/verify-email") 20 | async def verify_email(token: str, request: Request): 21 | conn = create_connection() 22 | if conn: 23 | try: 24 | cursor = conn.cursor() 25 | cursor.execute("UPDATE users SET status = 'verified' WHERE verification_token = %s", (token,))  # parameterized query to avoid SQL injection 26 | conn.commit() 27 | html_content = f""" 28 | 29 | 30 | 31 | Email Verified 32 | 37 | 38 | 39 |
<h1>Email Verified Successfully!</h1>
40 | <p>You will be redirected shortly...</p>
41 | 42 | 43 | """ 44 | return HTMLResponse(content=html_content, status_code=200) 45 | except Exception as e: 46 | raise HTTPException(status_code=400, detail=f"Error: {e}") 47 | finally: 48 | if 'cursor' in locals(): 49 | cursor.close() 50 | conn.close() 51 | 52 | if __name__ == "__main__": 53 | print("server up") 54 | uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) -------------------------------------------------------------------------------- /database/session_manager.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.connection import create_connection 3 | 4 | def create_session_table(): 5 | conn = create_connection() 6 | if conn: 7 | try: 8 | cursor = conn.cursor() 9 | create_table_query = """ 10 | CREATE TABLE IF NOT EXISTS session ( 11 | id SERIAL PRIMARY KEY, 12 | user_id INT NOT NULL, 13 | start_session TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 14 | end_session TIMESTAMP NULL 15 | ); 16 | """ 17 | cursor.execute(create_table_query) 18 | conn.commit() 19 | except Exception as e: 20 | print(e) 21 | finally: 22 | cursor.close() 23 | conn.close() 24 | 25 | def insert_start_session(user_id): 26 | conn = create_connection() 27 | new_id = None 28 | if conn: 29 | try: 30 | cursor = conn.cursor() 31 | cursor.execute("SELECT id, end_session FROM session ORDER BY id DESC LIMIT 1") 32 | record = cursor.fetchone() 33 | if record and record[1] is None: 34 | update_end_session(record[0]) 35 | cursor.execute("INSERT INTO session (user_id) VALUES (%s) RETURNING id;", (user_id,)) 36 | new_id = cursor.fetchone()[0] 37 | conn.commit() 38 | except Exception as e: 39 | print(e) 40 | finally: 41 | if 'cursor' in locals(): 42 | cursor.close() 43 | conn.close() 44 | return new_id 45 | 46 | def update_end_session(session_id): 47 | query = "UPDATE session SET end_session = CURRENT_TIMESTAMP WHERE id = %s" 48 | conn = create_connection() 49 | if conn: 50 | try: 51 | cursor = conn.cursor() 52 | cursor.execute(query, (session_id,)) 53 | conn.commit() 54 | except Exception as e: 55 | print(e) 56 | finally: 57 | if 'cursor' in locals(): 58 | cursor.close() 59 | conn.close() -------------------------------------------------------------------------------- /views/login_view.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.user_manager import authenticate_user, get_user_id, update_user_country, create_google_user 3 | from database.session_manager import insert_start_session 4 | from views.capcha_plugin import captcha_control 5 | from utils.ip_tools import get_remote_country 6 | from embeddings.vector_store import clear_cache 7 | from streamlit_google_auth import Authenticate 8 | from utils.google_authenticate import authenticator 9 | 10 | 11 | def init_login_session(email): 12 | user_id = get_user_id(email) 13 | session_id = insert_start_session(user_id) 14 | st.session_state["logged_in"] = True 15 | st.session_state["email"] = email 16 | st.session_state["user_id"] = user_id 17 | st.session_state["session_id"] = session_id 18 | clear_cache(user_id) 19 | 20 | def login_page(): 21 | st.title("Sign In") 22 | email = st.text_input("Email") 23 | password = st.text_input("Password", type="password") 24 | authenticator.check_authentification() 25 | authorization_url = authenticator.get_authorization_url() 26 | 27 | print(st.session_state["connected"]) 28 | # SignIn By Google 29 | if st.session_state["connected"] == False: 30 | st.link_button('Sign In 
With Google', authorization_url, use_container_width=True) 31 | elif st.session_state["connected"] == True: 32 | email = st.session_state['user_info'].get('email') 33 | create_google_user(email) 34 | init_login_session(email) 35 | st.rerun() 36 | 37 | # Captcha Component 38 | captcha_control() 39 | 40 | # Signin By Email and Password 41 | col1, col2, col3 = st.columns([1, 3, 1]) 42 | with col2: 43 | if st.button("Sign In", use_container_width=True): 44 | res = authenticate_user(email, password) 45 | if res == "Success": 46 | update_user_country(email, get_remote_country()) 47 | init_login_session(email) 48 | st.rerun() 49 | else: 50 | # st.error(res) 51 | print(res) 52 | 53 | if st.button("Go to Sign Up", use_container_width=True): 54 | st.session_state['page'] = 'signup' 55 | st.rerun() 56 | 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAG System with OpenAI and Streamlit 🤖📄 2 | 3 | Welcome to the **RAG System** (Retrieve and Generate) — an innovative AI-powered chatbot that leverages the OpenAI Assistant API and Streamlit to provide real-time, context-aware answers based on user-uploaded documents. 4 | 5 | ## Features ✨ 6 | 7 | - **Multiformat Document Support**: Upload and process various file types including PDFs, DOCX, PPTX, TXT, and script files. 📂 8 | - **Real-time Responses**: Enjoy seamless and interactive responses fetched via WebSocket connections. 🔄 9 | - **Context Aware**: Provides answers based on the content of the uploaded documents, making the interactions more meaningful and personalized. 🔍 10 | 11 | ## Installation ⚙️ 12 | 13 | To get started with the RAG system, follow these steps: 14 | 15 | 1. **Clone the Repository**: 16 | ```bash 17 | git clone https://github.com/SuperGalaxy0901/Streamlit-OpenAI-Chatbot.git 18 | cd rag-system 19 | ``` 20 | 21 | 2. **Set up a Virtual Environment** (recommended): 22 | ```bash 23 | python -m venv env 24 | source env/bin/activate # On Windows use `env\Scripts\activate` 25 | ``` 26 | 27 | 3. **Install the Required Packages**: 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | 4. **Set Up Environment Variables**: 33 | - Create a `.env` file to safely store your API keys and configuration settings. 34 | ```plaintext 35 | OPENAI_API_KEY=your_openai_api_key 36 | ``` 37 | 38 | ## Usage 🚀 39 | 40 | 1. **Run the Streamlit App**: 41 | ```bash 42 | streamlit run app.py 43 | ``` 44 | 45 | 2. **Interact with the Chatbot**: 46 | - Upload documents via the application interface. 📤 47 | - Engage with the chatbot as it generates insightful responses based on your document contents. 💬 48 | 49 | ## Architecture Overview 🏗️ 50 | 51 | - **Streamlit**: Provides the front-end interface where users can upload documents and interact with the chatbot. 🌐 52 | - **OpenAI Assistant API**: Powers the natural language comprehension and generation. 🧠 53 | - **WebSockets**: Enables real-time, efficient communication between the front-end and back-end services. 📡 54 | 55 | ## Acknowledgements 🙏 56 | 57 | - [OpenAI](https://openai.com) for their incredible API. 58 | - [Streamlit](https://streamlit.io) for the easy-to-use app framework. 
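
## Fuller `.env` Example 🔧

The example above only covers the OpenAI key; `config/settings.py` also reads database, MailerSend, URL, reCAPTCHA, and Langfuse settings. Below is a sketch of a more complete `.env` — every value is a placeholder, so substitute your own credentials and hosts:

```plaintext
OPENAI_API_KEY=your_openai_api_key
DB_HOST=localhost
DB_USER=your_db_user
DB_PASSWORD=your_db_password
DB_NAME=your_db_name
MAILERSEND_API_KEY=your_mailersend_api_key
EMAIL_TEMPLATE_SIGNUP=your_mailersend_template_id
BACKEND_URL=http://localhost:8000
PRODUCT_URL=http://localhost:8501
RECAPTCHA_SITE_KEY=your_recaptcha_site_key
RECAPTCHA_SECRET_KEY=your_recaptcha_secret_key
LANGFUSE_PUBLIC_KEY=your_langfuse_public_key
LANGFUSE_SECRET_KEY=your_langfuse_secret_key
LANGFUSE_HOST=your_langfuse_host
```

If you run the stack with `docker-compose.yml`, most of these are passed into the `app` container as environment variables, so keep the two files consistent.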
59 | -------------------------------------------------------------------------------- /views/signup_view.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.user_manager import create_user, verify_user, create_google_user 3 | from helpers.validators import is_valid_email, is_valid_password 4 | from views.capcha_plugin import captcha_control 5 | from views.login_view import init_login_session 6 | from utils.google_authenticate import authenticator 7 | 8 | def signup_page(): 9 | st.title("🔐 Sign Up") 10 | 11 | new_email = st.text_input("Email") 12 | new_password = st.text_input("New Password", type='password') 13 | confirm_password = st.text_input("Confirm Password", type='password') 14 | 15 | # SignUp By Google 16 | authenticator.check_authentification() 17 | authorization_url = authenticator.get_authorization_url() 18 | 19 | if not st.session_state.get('connected', False): 20 | st.link_button('Sign Up With Google', authorization_url, use_container_width=True) 21 | else: 22 | email = st.session_state['user_info'].get('email') 23 | create_google_user(email) 24 | init_login_session(email) 25 | st.rerun() 26 | 27 | # Captcha Component 28 | captcha_control() 29 | 30 | if 'server_code' not in st.session_state: 31 | st.session_state.server_code = "" 32 | 33 | col1, col2, col3 = st.columns([1, 3, 1]) 34 | with col2: 35 | if st.button("Sign Up", use_container_width=True): 36 | if not is_valid_email(new_email): 37 | st.error("🚫 Invalid email format") 38 | elif not is_valid_password(new_password): 39 | st.error("🚫 Password must be at least 8 characters long, contain a letter, a number, and a special character") 40 | elif new_password != confirm_password: 41 | st.error("🚫 Passwords do not match") 42 | else: 43 | ret = create_user(new_email, new_password) 44 | if ret != "error": 45 | st.success("✅ User created successfully. Please check your email for the verification link.") 46 | else: 47 | st.error("🚫 Error creating user. Please try again.") 48 | 49 | if st.button("Go to Sign In", use_container_width=True): 50 | st.session_state['page'] = "login" 51 | st.rerun() 52 | 53 | st.markdown("
", unsafe_allow_html=True) 54 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | venv/ 13 | ENV/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | tests/__pycache__/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # Celery stuff 86 | celerybeat-schedule.* 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | .dmypy.json 106 | dmypy.json 107 | 108 | # Pyre type checker 109 | .pyre/ 110 | 111 | # pytype static type analyzer 112 | .pytype/ 113 | 114 | # Cython debug symbols 115 | cython_debug/ 116 | 117 | # Other artifacts 118 | *.swp 119 | *~ 120 | .DS_Store 121 | Thumbs.db 122 | 123 | # Docker-specific ignores 124 | docker-compose.local.yml 125 | # .dockerignore file itself to avoid inclusions 126 | .dockerignore 127 | # Others 128 | .git 129 | .gitignore 130 | .tmp 131 | .vscode/ 132 | .idea/ 133 | *.bak 134 | *.tmp -------------------------------------------------------------------------------- /database/chat_manager.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.connection import create_connection 3 | import bcrypt 4 | 5 | def create_chat_table(): 6 | conn = create_connection() 7 | if conn: 8 | try: 9 | cursor = conn.cursor() 10 | create_table_query = """ 11 | CREATE TABLE IF NOT EXISTS chat ( 12 | id SERIAL PRIMARY KEY, 13 | user_id INT NOT NULL, 14 | vector_id VARCHAR(255) NOT NULL, 15 | thread_id VARCHAR(255) NOT NULL, 16 | file_id VARCHAR(255) NOT NULL, 17 | assistant_id VARCHAR(255) NOT NULL 18 | ); 19 | """ 20 | cursor.execute(create_table_query) 21 | conn.commit() 22 | except Exception as e: 23 | print(e) 24 | finally: 25 | cursor.close() 26 | conn.close() 27 | 28 | def create_chat(user_id, vector_id, thread_id, file_id, assistant_id): 29 | conn = create_connection() 30 | if conn: 31 | try: 32 | cursor = conn.cursor() 33 | cursor.execute("INSERT INTO chat (user_id, vector_id, thread_id, file_id, assistant_id) VALUES (%s, %s, %s, %s, %s)", 34 | 
(user_id, vector_id, thread_id, file_id, assistant_id)) 35 | conn.commit() 36 | except Exception as e: 37 | print(e) 38 | finally: 39 | if 'cursor' in locals(): 40 | cursor.close() 41 | conn.close() 42 | 43 | def get_individual_chat(id): 44 | conn = create_connection() 45 | if conn: 46 | try: 47 | cursor = conn.cursor() 48 | cursor.execute("SELECT user_id, vector_id, thread_id, file_id, assistant_id FROM chat WHERE id = %s", (id,)) 49 | record = cursor.fetchone() 50 | if record: 51 | return record 52 | else: 53 | return None 54 | except Exception as e: 55 | print(e) 56 | return None 57 | finally: 58 | if 'cursor' in locals(): 59 | cursor.close() 60 | conn.close() 61 | 62 | def get_user_chats(user_id): 63 | conn = create_connection() 64 | if conn: 65 | try: 66 | cursor = conn.cursor() 67 | cursor.execute("SELECT * FROM chat WHERE user_id = %s", (user_id,)) 68 | record = cursor.fetchall() 69 | if record: 70 | return record 71 | else: 72 | return None 73 | except Exception as e: 74 | print(e) 75 | return None 76 | finally: 77 | if 'cursor' in locals(): 78 | cursor.close() 79 | conn.close() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | # client_secret.json 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | # .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /database/user_manager.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from database.connection import create_connection 3 | from utils.mailersend import signup_mailer 4 | import bcrypt 5 | import random 6 | 7 | def generate_6_digit_number(): 8 | return random.randint(100000, 999999) 9 | 10 | def create_users_table(): 11 | conn = create_connection() 12 | if conn: 13 | try: 14 | cursor = conn.cursor() 15 | create_table_query = """ 16 | CREATE TABLE IF NOT EXISTS users ( 17 | id SERIAL PRIMARY KEY, 18 | email VARCHAR(255) UNIQUE NOT NULL, 19 | password VARCHAR(255) NOT NULL, 20 | verify_id VARCHAR(255), 21 | status VARCHAR(20), 22 | country VARCHAR(30), 23 | verification_token VARCHAR(255), 24 | is_gmail INT 25 | ); 26 | """ 27 | cursor.execute(create_table_query) 28 | conn.commit() 29 | except Exception as e: 30 | print(e) 31 | finally: 32 | cursor.close() 33 | conn.close() 34 | else: 35 | st.error("Unable to connect to the database.") 36 | 37 | def create_user(email, password): 38 | print("create user!!!!!!") 39 | flag = 0 40 | conn = create_connection() 41 | if conn: 42 | try: 43 | cursor = conn.cursor() 44 | hashed_password = bcrypt.hashpw(password.encode('utf-8'), bcrypt.gensalt()).decode("utf-8") 45 | verify_id = generate_6_digit_number() 46 | status = "pending" 47 | cursor.execute( 48 | "INSERT INTO users (email, password, verification_token, status, is_gmail) VALUES (%s, %s, %s, %s, %s)", 49 | (email, hashed_password, verify_id, status, 0) 50 | ) 51 | print("signup_mailer called!!!!!!") 52 | signup_mailer(email, verify_id) 53 | conn.commit() 54 | except Exception as e: 55 | print(e) 56 | flag = 1 57 | finally: 58 | if 'cursor' in locals(): 59 | cursor.close() 60 
| conn.close() 61 | if flag == 0: 62 | return verify_id 63 | else: 64 | return "error" 65 | 66 | def create_google_user(email): 67 | conn = create_connection() 68 | if conn: 69 | try: 70 | cursor = conn.cursor() 71 | cursor.execute("SELECT * FROM users WHERE email = %s", (email,)) 72 | result = cursor.fetchone() 73 | if result is None: 74 | status = "verified" 75 | cursor.execute( 76 | "INSERT INTO users (email, password, status, is_gmail) VALUES (%s, %s, %s, %s)", 77 | (email, 'XXX', status, 1) 78 | ) 79 | conn.commit() 80 | else: 81 | print(email) 82 | except Exception as e: 83 | print(e) 84 | pass 85 | finally: 86 | if 'cursor' in locals(): 87 | cursor.close() 88 | conn.close() 89 | 90 | def verify_user(email): 91 | conn = create_connection() 92 | if conn: 93 | try: 94 | cursor = conn.cursor() 95 | cursor.execute( 96 | "UPDATE users SET status = %s WHERE email = %s", 97 | ('verified', email) 98 | ) 99 | conn.commit() 100 | except Exception as e: 101 | print(e) 102 | finally: 103 | if 'cursor' in locals(): 104 | cursor.close() 105 | conn.close() 106 | 107 | def authenticate_user(email, password): 108 | conn = create_connection() 109 | if conn: 110 | try: 111 | cursor = conn.cursor() 112 | query = "SELECT password, status FROM users WHERE email = %s AND is_gmail = %s" 113 | cursor.execute(query, (email, 0)) 114 | record = cursor.fetchone() 115 | 116 | if record: 117 | if bcrypt.checkpw(password.encode('utf-8'), record[0].encode('utf-8')): 118 | if record[1] == "verified": 119 | return "Success" 120 | else: 121 | return "Email has not been verified" 122 | else: 123 | return "Invalid email or password" 124 | else: 125 | return "Invalid email or password" 126 | except Exception as e: 127 | print(e) 128 | return False 129 | finally: 130 | if 'cursor' in locals(): 131 | cursor.close() 132 | conn.close() 133 | else: 134 | print(e) 135 | return False 136 | 137 | def get_user_id(email): 138 | conn = create_connection() 139 | if conn: 140 | try: 141 | cursor = conn.cursor() 142 | cursor.execute("SELECT id FROM users WHERE email = %s", (email,)) 143 | record = cursor.fetchone() 144 | if record: 145 | return record[0] 146 | else: 147 | return None 148 | except Exception as e: 149 | print(e) 150 | return None 151 | finally: 152 | if 'cursor' in locals(): 153 | cursor.close() 154 | conn.close() 155 | 156 | def update_user_country(email, country): 157 | conn = create_connection() 158 | if conn: 159 | try: 160 | cursor = conn.cursor() 161 | cursor.execute( 162 | "UPDATE users SET country = %s WHERE email = %s", 163 | (country, email) 164 | ) 165 | conn.commit() 166 | except Exception as e: 167 | print(e) 168 | finally: 169 | if 'cursor' in locals(): 170 | cursor.close() 171 | conn.close() -------------------------------------------------------------------------------- /embeddings/vector_store.py: -------------------------------------------------------------------------------- 1 | from config import settings 2 | from langfuse import Langfuse 3 | from langfuse.openai import openai 4 | from langfuse.decorators import langfuse_context, observe 5 | import hashlib 6 | import secrets 7 | import io 8 | import time 9 | from database.chat_manager import create_chat, get_user_chats 10 | from database.cost_manager import insert_cost 11 | 12 | openai_client = openai.OpenAI(api_key=settings.OPENAI_API_KEY) 13 | # Initialize Langfuse Client 14 | langfuse_client = Langfuse( 15 | secret_key=settings.LANGFUSE_SECRET_KEY, 16 | public_key=settings.LANGFUSE_PUBLIC_KEY, 17 | host=settings.LANGFUSE_HOST 18 | ) 19 | 20 | 
@observe() 21 | def generate_vector_store(uploaded_files, user_id): 22 | # Generate Vector Name Using Hash 23 | random_bytes = secrets.token_bytes(64) 24 | hash_object = hashlib.sha256() 25 | hash_object.update(random_bytes) 26 | vector_name = hash_object.hexdigest() 27 | 28 | # Init File for upload OpenAI 29 | file_content = uploaded_files[0].getvalue() 30 | file_bytes_io = io.BytesIO(file_content.encode('utf-8') if isinstance(file_content, str) else file_content) 31 | file_bytes_io.name = uploaded_files[0].name 32 | 33 | # Upload File on OpenAI store 34 | file = openai_client.files.create( 35 | file=file_bytes_io, 36 | purpose='assistants' 37 | ) 38 | uploaded_files[0] = file.id 39 | 40 | # Vector Store 41 | vector = openai_client.beta.vector_stores.create( 42 | name=vector_name 43 | ) 44 | openai_client.beta.vector_stores.files.create( 45 | vector_store_id=vector.id, 46 | file_id=file.id 47 | ) 48 | 49 | # chat assistant 50 | assistant = openai_client.beta.assistants.create( 51 | instructions="Use the file provided as your knowledge base to best respond to customer queries. Only include at least on file citation(for example: 【4:1†source】) in the answer.", 52 | model="gpt-4o-mini", 53 | tools=[ 54 | { 55 | "type": "file_search", 56 | } 57 | ], 58 | tool_resources={ 59 | "file_search": { 60 | "vector_store_ids":[vector.id] 61 | } 62 | } 63 | ) 64 | 65 | # chat thread 66 | thread = openai_client.beta.threads.create() 67 | create_chat(user_id, vector.id, thread.id, file.id, assistant.id) 68 | print(f"Thread: {thread.id}") 69 | print(f"Assistant: {assistant.id}") 70 | return thread.id, assistant.id, file_bytes_io.name 71 | 72 | @observe() 73 | def get_conversational_chain(user_question, thread_id, assistant_id, file_name, session_id): 74 | """Ignore thread_id parameter due to cost limit""" 75 | print(f"Thread: {thread_id}") 76 | print(f"Assistant: {assistant_id}") 77 | print(f"User Question: {user_question}") 78 | if user_question == "": 79 | return 80 | thread = openai_client.beta.threads.create() 81 | openai_client.beta.threads.messages.create( 82 | thread_id=thread.id, 83 | role="user", 84 | content=user_question + """Include references after the answer in this format: 85 |
86 | File Name with extension Here
87 | > Exact contents of the references(10 sentences)
88 |
89 | 90 | Never produce unnecessary statements like "For more details, here is the citation formatted as requested:" 91 | Only include at least on file citation(for example: 【4:1†source】) in the answer and do not include file citations(for example: 【4:1†source】) in the references. 92 | Generate answer at any cost. 93 | """ 94 | ) 95 | with openai_client.beta.threads.runs.stream( 96 | thread_id=thread.id, 97 | assistant_id=assistant_id, 98 | instructions=f"You will act as a helpful assistant. Please analyze {file_name} and provide accurate answers to user question. Only include at least on file citation(for example: 【4:1†source】) in the answer.", 99 | ) as stream: 100 | for event in stream: 101 | 102 | if event.event == 'thread.run.step.created': 103 | print('\nMessage creation detected...') 104 | 105 | for text in stream.text_deltas: 106 | yield text 107 | elif event.event == 'thread.message.delta': 108 | yield event.data.delta.content[0].text.value 109 | 110 | if stream._current_message_content: 111 | file_citation_annotations = stream._current_message_content.text.annotations 112 | yield file_citation_annotations 113 | 114 | # log internal generation within the openai assistant as a separate child generation to langfuse 115 | if stream.current_run: 116 | # Extract and print cost data 117 | cost_data = cost_for_tokens(stream.current_run.usage) 118 | print(f"Cost for this response: {cost_data}") 119 | insert_cost(session_id, cost_data) 120 | 121 | @observe() 122 | def generate_questions(thread_id, assistant_id, file_name, session_id): 123 | thread = openai_client.beta.threads.create() 124 | openai_client.beta.threads.messages.create( 125 | thread_id=thread.id, 126 | role="user", 127 | content=f"Generate 3 questions for the {file_name}. Only output 3 questions and never output statement." 128 | ) 129 | 130 | run = openai_client.beta.threads.runs.create( 131 | thread_id=thread.id, 132 | assistant_id=assistant_id, 133 | instructions=f"You will act as a helpful assistant. Please analyze {file_name} and provide accurate answers to user question." 134 | ) 135 | 136 | retrieved_run = openai_client.beta.threads.runs.retrieve( 137 | thread_id=thread.id, 138 | run_id=run.id 139 | ) 140 | 141 | counter = 0 142 | while retrieved_run.status != "completed": 143 | retrieved_run = openai_client.beta.threads.runs.retrieve( 144 | thread_id=thread.id, 145 | run_id=run.id 146 | ) 147 | counter += 1 148 | if counter % 10 == 0: 149 | time.sleep(1) 150 | 151 | thread_messages = openai_client.beta.threads.messages.list(thread.id) 152 | 153 | # log the generation for tracking cost 154 | if retrieved_run: 155 | langfuse_client.generation( 156 | trace_id=langfuse_context.get_current_trace_id(), 157 | parent_observation_id=langfuse_context.get_current_observation_id(), 158 | model=retrieved_run.model, 159 | usage=retrieved_run.usage, 160 | input=f"generate 3 questions for the {file_name}. 
only output 3 questions", 161 | output=[msg.content[0].text.value for msg in thread_messages.data if msg.role == 'assistant'] 162 | ) 163 | # Extract and print cost data 164 | cost_data = cost_for_tokens(retrieved_run.usage) 165 | print(f"Cost for this response: {cost_data}") 166 | insert_cost(session_id, cost_data) 167 | return thread_messages.data[0].content[0].text.value 168 | 169 | def clear_cache(user_id): 170 | records = get_user_chats(user_id) 171 | 172 | # Check if records is None and set it to an empty list if it is 173 | if records is None: 174 | records = [] 175 | 176 | for record in records: 177 | vector_id = record[2] 178 | file_id = record[4] 179 | print(vector_id) 180 | print(file_id) 181 | try: 182 | # Ensure you pass the parameters correctly as dicts 183 | openai_client.beta.vector_stores.delete(vector_store_id=vector_id) 184 | openai_client.files.delete(file_id=file_id) 185 | except Exception as e: 186 | print(f"Error while deleting resources: {e}") 187 | 188 | return 189 | 190 | def cost_for_tokens(usage, model = "GPT-4o mini"): 191 | return usage.prompt_tokens * 0.15 / 1000000 + usage.completion_tokens * 0.6 / 1000000 192 | 193 | def get_file_content(file_id): 194 | # Retrieve file content 195 | try: 196 | return openai_client.files.retrieve(file_id=file_id) 197 | except Exception as e: 198 | print(f"Failed to retrieve file metadata: {e}") 199 | return None 200 | -------------------------------------------------------------------------------- /views/main_view.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from embeddings.vector_store import generate_questions, generate_vector_store, get_conversational_chain, clear_cache, get_file_content 3 | from database.session_manager import update_end_session 4 | from functools import partial 5 | from streamlit_url_fragment import get_fragment 6 | import fitz 7 | import time 8 | import re 9 | 10 | from utils.string_util import find_positions_multiple 11 | 12 | file_citations = [] 13 | 14 | def clear_chat_history(): 15 | st.session_state.answered_questions = [] 16 | st.session_state.messages = [ 17 | {"role": "assistant", "content": "Upload some PDFs and ask me a question"}] 18 | 19 | async def user_input(user_question, thread_id, assistant_id, file_name, session_id): 20 | response = get_conversational_chain(user_question, thread_id, assistant_id, file_name, session_id) 21 | for chunk in response: 22 | yield chunk 23 | 24 | def display_question(thread_id, assistant_id, file_name, session_id): 25 | questions = generate_questions(thread_id, assistant_id, file_name, session_id) 26 | return [q.strip() for q in questions.split('\n') if q.strip()] 27 | 28 | def process_file(): 29 | progress_bar = st.progress(0) 30 | for i in range(10): 31 | time.sleep(0.2) 32 | progress_bar.progress((i + 1) * 10) 33 | 34 | def start_new_job(): 35 | st.session_state.questions = [] 36 | st.session_state.uploaded_files = [] 37 | st.session_state.file_uploader_key += 1 38 | st.session_state.clicked_file_id = None 39 | st.session_state.citation_index = None 40 | st.session_state.initial_state = True 41 | clear_chat_history() 42 | clear_cache(st.session_state["user_id"]) 43 | st.rerun() 44 | 45 | def set_file_id(file_id, index): 46 | if st.session_state.clicked_file_id != file_id or st.session_state.citation_index != index: 47 | if st.session_state.clicked_file_id is not None: 48 | st.session_state.initial_state = False 49 | st.session_state.clicked_file_id = file_id 50 | 
st.session_state.citation_index = index 51 | st.rerun() 52 | 53 | async def display_chat_room(): 54 | global file_citations 55 | 56 | st.title("Chat with document files 🤖") 57 | st.write("Welcome to the chat!") 58 | st.write("Upload any file types or several types(pdf, docx, csv, pptx, xlsx, txt)") 59 | 60 | st.sidebar.button('Clear Chat History', on_click=clear_chat_history, use_container_width=True) 61 | st.sidebar.button('SignOut', on_click=logout) 62 | 63 | current_file_id = get_fragment() 64 | if current_file_id is not None: 65 | conf = current_file_id.split("#") 66 | if len(conf) > 2: 67 | set_file_id(conf[1], conf[2]) 68 | else: 69 | set_file_id(None, None) 70 | # Chat input 71 | if "messages" not in st.session_state.keys(): 72 | st.session_state.messages = [ 73 | {"role": "assistant", "content": "Upload some PDFs and ask me a question"}] 74 | 75 | for message in st.session_state.messages: 76 | with st.chat_message(message["role"]): 77 | # st.code(message["content"], language="markdown", wrap_lines=True) 78 | st.markdown(f"""{message["content"]}""", unsafe_allow_html=True) 79 | 80 | button_pressed = '' 81 | if 'questions' in st.session_state and st.session_state.questions: 82 | for question in st.session_state.questions: 83 | if st.button(question, disabled=question in st.session_state.answered_questions): 84 | button_pressed = question[3:] 85 | if question not in st.session_state.answered_questions: 86 | st.session_state.answered_questions.append(question) 87 | 88 | if prompt := ((st.chat_input()) or button_pressed): 89 | st.session_state.messages.append({"role": "user", "content": prompt}) 90 | with st.chat_message("user"): 91 | st.code(prompt, language="markdown", wrap_lines=True) 92 | 93 | st.markdown(""" 94 | 99 | """, unsafe_allow_html=True) 100 | 101 | if st.session_state.messages[-1]["role"] != "assistant" and 'thread_id' in st.session_state: 102 | with st.chat_message("assistant"): 103 | with st.spinner("Thinking..."): 104 | response = "" 105 | placeholder = st.empty() 106 | async for chunk in user_input(prompt, st.session_state.thread_id, st.session_state.assistant_id, st.session_state['file_name'], st.session_state.session_id): 107 | if isinstance(chunk, str): 108 | response = response + chunk 109 | else: 110 | positions = find_positions_multiple(response, "【", "】") 111 | 112 | if len(positions) > 0: 113 | placeholder.markdown("") 114 | 115 | end_pos = 0 116 | result = "" 117 | for index, (start, end) in enumerate(positions): 118 | result += response[end_pos:start] 119 | end_pos = end 120 | 121 | html_link = f"{response[start:end]}" 122 | result += html_link 123 | 124 | pattern = r'
\s*(.*?)\s*
' 125 | matches = re.findall(pattern, response[end:], re.DOTALL) 126 | 127 | if len(matches) > 0: 128 | file_citations.append({ 129 | "index": chunk[index].start_index, 130 | "markdown": matches[0] 131 | }) 132 | 133 | result += response[end_pos:] 134 | response = result 135 | 136 | placeholder.markdown(response, unsafe_allow_html=True) 137 | 138 | if response: 139 | st.session_state.messages.append({"role": "assistant", "content": response}) 140 | st.rerun() 141 | 142 | if len(st.session_state.answered_questions) is 3: 143 | st.session_state.btn_disabed = False 144 | st.text("Would you like to continue?") 145 | if st.button("Yes", disabled=st.session_state.btn_disabled): 146 | st.session_state.answered_questions = [] 147 | st.session_state.questions = display_question(st.session_state['thread_id'], st.session_state['assistant_id'], st.session_state['file_name'], st.session_state.session_id) 148 | st.rerun() 149 | if st.button("No", disabled=st.session_state.btn_disabled): 150 | st.session_state.btn_disabled = True 151 | st.rerun() 152 | 153 | async def main_content(): 154 | if "file_uploader_key" not in st.session_state: 155 | st.session_state["file_uploader_key"] = 0 156 | if "uploaded_files" not in st.session_state: 157 | st.session_state["uploaded_files"] = [] 158 | if 'clicked_file_id' not in st.session_state: 159 | st.session_state.clicked_file_id = None 160 | if 'citation_index' not in st.session_state: 161 | st.session_state.citation_index = None 162 | if 'initial_state' not in st.session_state: 163 | st.session_state.initial_state = False 164 | if 'answered_questions' not in st.session_state: 165 | st.session_state.answered_questions = [] 166 | if 'btn_disabled' not in st.session_state: 167 | st.session_state.btn_disabled = False 168 | # Sidebar for uploading PDF files 169 | with st.sidebar: 170 | st.title("Menu:") 171 | if st.button("Start a New Job", use_container_width=True): 172 | start_new_job() 173 | uploaded_files = st.file_uploader( 174 | "Upload your Files and Click on the Submit & Process Button", accept_multiple_files=True, key=st.session_state["file_uploader_key"]) 175 | if uploaded_files: 176 | st.session_state["uploaded_files"] = uploaded_files 177 | if uploaded_files != []: 178 | if st.button("Submit & Process", use_container_width=True): 179 | with st.spinner("Processing..."): 180 | start_time = time.time() 181 | thread_id, assistant_id, file_name = generate_vector_store(uploaded_files, user_id=st.session_state.user_id) 182 | st.session_state['thread_id'] = thread_id 183 | st.session_state['assistant_id'] = assistant_id 184 | st.session_state['file_name'] = file_name 185 | st.session_state.questions = display_question(thread_id, assistant_id, st.session_state['file_name'], st.session_state.session_id) 186 | process_file() 187 | end_time = time.time() 188 | elapsed_time = end_time - start_time 189 | st.success(f"Done in {elapsed_time:.2f} seconds") 190 | 191 | if st.session_state.clicked_file_id and not st.session_state.initial_state: 192 | col1, col2 = st.columns(2) 193 | with col1: 194 | for citation in file_citations: 195 | if citation["index"] == int(st.session_state.citation_index): 196 | st.markdown(citation["markdown"], unsafe_allow_html=True) 197 | with col2: 198 | await display_chat_room() 199 | else: 200 | await display_chat_room() 201 | 202 | def logout(): 203 | update_end_session(st.session_state["session_id"]) 204 | clear_cache(st.session_state["user_id"]) 205 | st.session_state.clear() 206 | st.session_state['logout'] = True 207 | 
st.session_state['name'] = None 208 | st.session_state['username'] = None 209 | st.session_state['connected'] = False --------------------------------------------------------------------------------