├── activate.bat
├── requirements.txt
├── create_venv.bat
├── .gitignore
├── template.env
├── main.py
├── pages
    ├── chatbot_page.py
    └── ingest_page.py
├── chroma_services.py
├── genai_services.py
└── data.txt

/activate.bat:
--------------------------------------------------------------------------------
1 | call .venv\Scripts\activate.bat
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | openai
3 | tiktoken
4 | chromadb
5 | markitdown[all]
6 | # Provides the `dotenv` module imported by chroma_services.py and genai_services.py
7 | python-dotenv
8 | 
--------------------------------------------------------------------------------
/create_venv.bat:
--------------------------------------------------------------------------------
1 | python -m venv .venv
2 | call activate.bat
3 | pip install -r requirements.txt
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Virtual Environment
2 | .venv
3 | 
4 | # Chroma DB
5 | chroma_db
6 | 
7 | # Temporary files
8 | __pycache__
9 | 
10 | # Environment variables
11 | .env
--------------------------------------------------------------------------------
/template.env:
--------------------------------------------------------------------------------
1 | MODEL_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
2 | MODEL_API_KEY = "GEMINI_API_KEY"
3 | MODEL_NAME = "gemini-2.0-flash"
4 | 
5 | 
6 | CHROMA_COLLECTION_NAME = "rag_collection"
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.set_page_config(page_title="RAG QnA & Summarization Chatbot", layout="wide")
4 | 
5 | ingest_page = st.Page("pages/ingest_page.py", title="Ingest")
6 | chatbot_page = st.Page("pages/chatbot_page.py", title="Chatbot")
7 | 
8 | 
pg = st.navigation([ 9 | ingest_page, 10 | chatbot_page 11 | ]) 12 | 13 | pg.run() 14 | -------------------------------------------------------------------------------- /pages/chatbot_page.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from genai_services import answer_with_context 3 | from chroma_services import query_documents 4 | 5 | st.title("RAG QnA Chatbot") 6 | st.write("Ask questions about your ingested document!") 7 | user_query = st.chat_input("Your question:") 8 | if user_query: 9 | # Query Chroma for context 10 | context_chunks = query_documents(user_query, n_results=3) 11 | 12 | with st.spinner("Generating answer..."): 13 | answer = answer_with_context(user_query, context_chunks) 14 | st.markdown(f"**Answer:** {answer}") 15 | st.expander("Show retrieved context").write("\n".join(context_chunks)) 16 | -------------------------------------------------------------------------------- /chroma_services.py: -------------------------------------------------------------------------------- 1 | import chromadb 2 | import os 3 | 4 | from dotenv import load_dotenv 5 | 6 | load_dotenv(dotenv_path=".env") 7 | 8 | chroma_client = chromadb.PersistentClient(path="./chroma_db") 9 | collection = chroma_client.get_or_create_collection( 10 | name=os.getenv("CHROMA_COLLECTION_NAME") 11 | ) 12 | 13 | 14 | def ingest_documents(docs): 15 | """Ingest documents into ChromaDB using 'all-MiniLM-L6-v2' Sentence Transformer 16 | 17 | Args: 18 | docs: list of strings (document chunks) 19 | """ 20 | # Ids for the docs 21 | ids = [f"chunk_{i}" for i in range(len(docs))] 22 | 23 | # Ingest chunks into the collection 24 | collection.add(documents=docs, ids=ids) 25 | 26 | return len(docs) 27 | 28 | 29 | def query_documents(query_text, n_results=3): 30 | """Query the collection for relevant documents 31 | 32 | Args: 33 | query_text: string to search for 34 | n_results: number of results to return 35 | 36 | Returns: 37 | 
List of relevant document chunks 38 | """ 39 | results = collection.query(query_texts=[query_text], n_results=n_results) 40 | if 'documents' in results and results['documents']: 41 | return results['documents'][0] 42 | else: 43 | return [] 44 | -------------------------------------------------------------------------------- /pages/ingest_page.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from markitdown import MarkItDown 3 | from genai_services import summarize_text, chunk_text 4 | from chroma_services import ingest_documents 5 | import tempfile 6 | import os 7 | 8 | st.title("Document Ingestion & Summarization") 9 | uploaded_file = st.file_uploader( 10 | "Upload a document (txt, pdf, or any text-based file supported by markitdown)", 11 | type=[ 12 | "txt", "pdf", "md", "html", "docx" 13 | ] 14 | ) 15 | if uploaded_file: 16 | # Save to temp file 17 | with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp: 18 | tmp.write(uploaded_file.read()) 19 | tmp_path = tmp.name 20 | 21 | # Convert to text using markitdown 22 | converter = MarkItDown() 23 | doc_text = converter.convert(tmp_path).text_content 24 | st.subheader("Document Preview:") 25 | st.text_area("Extracted Text", doc_text, height=200) 26 | 27 | # Summarize 28 | with st.spinner("Summarizing document..."): 29 | summary = summarize_text(doc_text) 30 | st.subheader("Summary:") 31 | st.write(summary) 32 | # Upload button if st.button("Upload & Ingest to Chroma DB"): 33 | # Chunk and ingest 34 | with st.spinner("Ingesting document..."): 35 | chunks = chunk_text(doc_text) 36 | ingest_documents(chunks) 37 | if st.button("Chatbot"): 38 | st.switch_page("pages/chatbot_page.py") 39 | -------------------------------------------------------------------------------- /genai_services.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tiktoken 3 | from 
typing import List 4 | from openai import OpenAI 5 | 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv(dotenv_path=".env") 9 | 10 | # Set OpenAI client with Gemini API configuration 11 | # You need to get your Gemini API key 12 | 13 | openai_client = OpenAI( 14 | api_key=os.getenv("MODEL_API_KEY"), 15 | base_url=os.getenv("MODEL_BASE_URL") 16 | ) 17 | 18 | 19 | def call_llm(messages: List[dict]) -> str: 20 | """Helper function to call Gemini API""" 21 | response = openai_client.chat.completions.create( 22 | model=os.getenv("MODEL_NAME"), 23 | messages=messages, 24 | ) 25 | return response.choices[0].message.content 26 | 27 | 28 | def summarize_text(text: str) -> str: 29 | """ 30 | Generate a summary of the text using LLM 31 | 32 | Args: 33 | text: Text to summarize 34 | 35 | Returns: 36 | Summary of the text 37 | """ 38 | messages = [ 39 | { 40 | "role": "system", 41 | "content": "You are a helpful assistant that summarizes documents accurately and concisely." 42 | }, 43 | { 44 | "role": "user", 45 | "content": f"Please summarize the following text concisely while capturing the key points:\n\n{text}" 46 | } 47 | ] 48 | 49 | return call_llm(messages) 50 | 51 | 52 | def chunk_text(text: str, chunk_size: int = 100, chunk_overlap: int = 10) -> List[str]: 53 | """ 54 | Split text into overlapping chunks of specified size 55 | 56 | Args: 57 | text: Text to split into chunks 58 | chunk_size: Maximum size of each chunk in tokens 59 | chunk_overlap: Overlap between chunks in tokens 60 | 61 | Returns: 62 | List of text chunks 63 | """ 64 | if not text: 65 | return [] 66 | 67 | # Use tiktoken to count tokens (or fallback to splitting by length) 68 | 69 | enc = tiktoken.encoding_for_model("gpt-3.5-turbo") 70 | tokens = enc.encode(text) 71 | 72 | # Create chunks with overlap 73 | chunks = [] 74 | i = 0 75 | while i < len(tokens): 76 | # Get chunk of size chunk_size 77 | chunk_end = min(i + chunk_size, len(tokens)) 78 | chunks.append(enc.decode(tokens[i:chunk_end])) 79 | # 
Move with overlap 80 | i = chunk_end - chunk_overlap if chunk_end < len(tokens) else chunk_end 81 | 82 | return chunks 83 | 84 | 85 | def answer_with_context(question: str, contexts: List[str]) -> str: 86 | """ 87 | Generate a response to a query using context from RAG 88 | 89 | Args: 90 | question: User's question 91 | contexts: List of relevant document chunks from ChromaDB 92 | 93 | Returns: 94 | LLM response to the question 95 | """ 96 | # Combine context into a single string with limited length 97 | combined_context = "\n\n---\n\n".join(contexts) 98 | 99 | messages = [ 100 | { 101 | "role": "system", 102 | "content": "You are a helpful assistant that answers questions based on the provided context. If you don't know the answer based on the context, say so." 103 | }, 104 | { 105 | "role": "user", 106 | "content": f"Context information:\n\n{combined_context}\n\nQuestion: {question}\n\nAnswer:" 107 | } 108 | ] 109 | 110 | return call_llm(messages) 111 | -------------------------------------------------------------------------------- /data.txt: -------------------------------------------------------------------------------- 1 | Top 20 International News – June 2025 2 | 1. U.S.–China Trade Talks Scheduled in London 3 | President Donald Trump announced that high-level trade negotiations between the U.S. and China will take place in London on June 9. This meeting aims to ease tensions and address issues such as rare earth exports and advanced technology trades. 4 | 5 | 2. UN to Host Israeli-Palestinian Peace Conference 6 | The United Nations will convene a conference from June 17–20 in New York, co-chaired by France and Saudi Arabia, to advance the two-state solution for the Israeli-Palestinian conflict. Israel has declined to participate, citing concerns over Hamas's actions. 7 | 8 | 3. 
European Leaders Criticize Israel's Actions in Gaza 9 | European leaders, including French President Emmanuel Macron and British Prime Minister Keir Starmer, have condemned Israel's military campaign in Gaza, which has resulted in over 54,000 Palestinian deaths. The EU is considering sanctions and reviewing trade agreements with Israel. 10 | 11 | 4. Trump and Musk Engage in Public Feud 12 | A public dispute has erupted between President Trump and tech billionaire Elon Musk over Trump's proposed "Big Beautiful Bill." Musk criticized the bill, leading to a sharp decline in Tesla's stock price and raising concerns about regulatory risks to Musk's companies. 13 | 14 | 5. Global Markets React to Trump-Musk Dispute 15 | The feud between Trump and Musk has impacted global markets, with Tesla shares dropping by over 14%. Investors are also cautious ahead of the U.S. May jobs report, as jobless claims have risen. 16 | 17 | 6. Kilmar Abrego Garcia Returned to U.S. for Trial 18 | Kilmar Abrego Garcia, previously deported by the Trump administration, has been returned to the U.S. to face charges related to a 2022 human smuggling case in Tennessee. 19 | 20 | 7. UN Calls for Ceasefire in Gaza 21 | The UN Security Council's elected members have initiated a draft resolution calling for an immediate, unconditional, and permanent ceasefire in Gaza, along with the release of hostages and lifting of aid restrictions. 22 | 23 | 8. World Environment Day Observed Globally 24 | June 5 marked the 52nd World Environment Day, led by the UN Environment Programme and hosted by Jeju, South Korea, under the theme #BeatPlasticPollution. 25 | 26 | 9. AIDS Still Claims a Life Every Minute 27 | Despite progress, AIDS-related deaths continue, with one person dying every minute. Funding cuts have disrupted HIV services, threatening hard-won gains. 28 | 29 | 10. 
Sudan's Women Face Health Crisis Amid Conflict 30 | In Sudan, ongoing hostilities and funding cuts have isolated rape survivors and pregnant women from essential health services, according to the UN sexual and reproductive health agency. 31 | 32 | 11. Trump Open to Meeting Putin and Zelensky in Turkey 33 | The White House has indicated that President Trump is open to meeting Russian President Vladimir Putin and Ukrainian President Volodymyr Zelensky in Turkey to discuss ongoing tensions. 34 | 35 | 12. Landslide in Sikkim Causes Fatalities 36 | A landslide in North Sikkim's Chatten area has resulted in at least three deaths, with more individuals feared missing. Search and identification efforts are ongoing. 37 | 38 | 13. International Days Observed in June 39 | June hosts several international observances, including World Sickle Cell Day on June 19, World Refugee Day on June 20, and International Yoga Day on June 21. 40 | 41 | 14. Pakistan Reports Military Losses Amid Tensions with India 42 | Reports indicate that Pakistan lost 30 missiles, six fighter jets, and two AWACS planes during recent tensions with India. 43 | 44 | 15. Mount Etna Erupts in Sicily 45 | Sicily's Mount Etna has erupted, emitting columns of smoke and ash. Authorities are monitoring the situation for potential hazards. 46 | 47 | 16. 5.8-Magnitude Earthquake Hits Turkey 48 | A 5.8-magnitude earthquake has struck Turkey's southwestern resorts, causing structural damage and prompting emergency responses. 49 | 50 | 17. North Korea Claims Success in Missile Launch 51 | North Korea has announced that a previously failed missile launch has now been successfully uprighted, showcasing its advancing military capabilities. 52 | 53 | 18. Kim Jong Un Pledges Support for Russia 54 | North Korean leader Kim Jong Un has vowed "unconditional support" for Russia's actions in Ukraine, strengthening ties between the two nations. 55 | 56 | 19. 
Belarus President to Visit China 57 | Belarusian President Alexander Lukashenko is scheduled to visit China for a three-day trip, aiming to bolster bilateral relations. 58 | 59 | 20. UN Highlights Violations of Child Rights in Guatemala 60 | The UN has reported violations of child rights in Guatemala, emphasizing the need for immediate action to protect vulnerable populations. --------------------------------------------------------------------------------