├── README.md ├── requirements.txt ├── .gitignore ├── frontend.py ├── survey.csv └── model.py /README.md: -------------------------------------------------------------------------------- 1 | # Rag-with-your-csv -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | accelerate 3 | langchain 4 | openai 5 | chromadb 6 | langchain-community 7 | tiktoken 8 | python-dotenv 9 | langchain_ollama 10 | streamlit 11 | streamlit_chat -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### AL ### 2 | #Template for AL projects for Dynamics 365 Business Central 3 | #launch.json folder 4 | .vscode/ 5 | #Cache folder 6 | .alcache/ 7 | #Symbols folder 8 | .alpackages/ 9 | #Snapshots folder 10 | .snapshots/ 11 | #Testing Output folder 12 | .output/ 13 | #Extension App-file 14 | *.app 15 | #Rapid Application Development File 16 | rad.json 17 | #Translation Base-file 18 | *.g.xlf 19 | #License-file 20 | *.flf 21 | #Test results file 22 | TestResults.xml -------------------------------------------------------------------------------- /frontend.py: -------------------------------------------------------------------------------- 1 | from model import run 2 | import streamlit as st 3 | from streamlit_chat import message as st_message 4 | 5 | 6 | # Initialize chat history 7 | if "messages" not in st.session_state: 8 | st.session_state.messages = [] 9 | 10 | # Display chat messages from history on app rerun 11 | for message in st.session_state.messages: 12 | with st.chat_message(message["role"]): 13 | st.markdown(message["content"]) 14 | 15 | if prompt := st.chat_input("What is up?"): 16 | # Display user message in chat message container 17 | with st.chat_message("user"): 18 | st.markdown(prompt) 19 | # Add user message to 
chat history 20 | st.session_state.messages.append({"role": "user", "content": prompt}) 21 | 22 | response = run(prompt) 23 | # Display assistant response in chat message container 24 | with st.chat_message("assistant"): 25 | st.markdown(response) 26 | # Add assistant response to chat history 27 | st.session_state.messages.append({"role": "assistant", "content": response}) -------------------------------------------------------------------------------- /survey.csv: -------------------------------------------------------------------------------- 1 | Question,Answer 2 | What sizes do you carry?,"We carry a range of sizes from XS to XXL, and some styles include additional sizes." 3 | Do you offer plus-size options?,"Yes, we offer an extensive plus-size collection designed for style and comfort." 4 | Are your clothes true to size?,"Our clothing generally runs true to size, but we recommend checking our size guide for precise measurements." 5 | What materials are your clothes made from?,"Our clothes are made from high-quality materials such as cotton, linen, and sustainable synthetics." 6 | How should I care for your clothing items?,"Each item includes a care label; generally, machine washing on a gentle cycle and air drying is recommended." 7 | Where are your products manufactured?,"Our products are manufactured in ethical factories located across various regions, including Europe and Asia." 8 | Do you offer international shipping?,"Yes, we offer international shipping to many countries. Please check our website for specific locations." 9 | What is your return policy?,We have a 30-day return policy for unworn items with original tags attached. 10 | How long does shipping usually take?,"Shipping typically takes 3-7 business days domestically and 10-15 days internationally, depending on location." 11 | Do you have a physical store location?,"Yes, we have a flagship store in [City/Location], and our store locator can help you find one near you." 
12 | Can I track my order?,"Yes, once your order is shipped, you'll receive a tracking number via email to monitor your package." 13 | Are there any discounts or promotions available?,We often have promotions; sign up for our newsletter and follow us on social media for updates. 14 | How often do you release new collections?,"We release new collections seasonally, around four times a year, to align with fashion trends." 15 | Do you provide gift wrapping services?,"Yes, we offer gift wrapping for a small additional fee, perfect for special occasions." 16 | What payment methods do you accept?,"We accept major credit cards, digital wallets, and other secure payment options." 17 | Can I exchange an item for a different size?,"We offer exchanges for different sizes, subject to availability. Contact customer service for assistance." 18 | Do you have a loyalty or rewards program?,"Yes, join our loyalty program to earn points with every purchase and receive exclusive discounts." 19 | Are your products environmentally friendly?,We prioritize sustainability by incorporating eco-friendly materials and ethical production practices. 20 | How do I know if an item is in stock?,"Our website reflects real-time stock availability. If an item is out, consider signing up for restock notifications." 21 | Can I pre-order upcoming items?,"Yes, certain items can be pre-ordered, allowing you early access to fresh styles." 
-------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from dotenv import load_dotenv # Load environment variables from a .env file 4 | 5 | import textwrap # (Optional) For formatting text if needed later 6 | 7 | # Import various libraries used in the script 8 | import langchain 9 | import chromadb 10 | import transformers 11 | import openai 12 | import torch 13 | import requests 14 | import json 15 | 16 | # Import specific classes and functions from libraries 17 | from transformers import AutoTokenizer 18 | from langchain.llms.huggingface_pipeline import HuggingFacePipeline 19 | from langchain.text_splitter import CharacterTextSplitter 20 | from langchain.document_loaders.csv_loader import CSVLoader 21 | from langchain.embeddings import OpenAIEmbeddings 22 | from langchain.vectorstores import Chroma 23 | from langchain.chains import RetrievalQA 24 | from langchain.prompts import PromptTemplate 25 | from langchain_ollama.llms import OllamaLLM 26 | 27 | 28 | 29 | # ============================================================================= 30 | # 1. Environment Setup and API Key Loading 31 | # ============================================================================= 32 | 33 | # Load environment variables from the .env file (e.g., API keys) 34 | 35 | 36 | # ============================================================================= 37 | # 6. Create the RetrievalQA Chain 38 | # ============================================================================= 39 | 40 | # The RetrievalQA chain combines: 41 | # - The language model (model) to generate responses. 42 | # - A retriever (db.as_retriever) that fetches relevant document chunks based on the query. 43 | # - A prompt that provides instructions on how to answer the query. 
def run(input):
    """Answer a customer question with RAG over ``survey.csv``.

    Pipeline: load and chunk the CSV, embed the chunks with OpenAI
    embeddings, index them in an in-memory Chroma store, then answer the
    question through a RetrievalQA chain backed by a local Ollama model.

    Args:
        input (str): The customer's question. (The name shadows the
            builtin ``input``; kept unchanged for backward compatibility
            with existing callers such as frontend.py.)

    Returns:
        str: The chatbot's answer.
    """
    # Chroma keeps a process-wide client cache; clear it so repeated calls
    # (e.g. on every Streamlit rerun) start from a clean in-memory store.
    chromadb.api.client.SharedSystemClient.clear_system_cache()

    # Load environment variables (OPENAI_API_KEY) from a .env file.
    load_dotenv()
    openai_api_key = os.getenv("OPENAI_API_KEY")

    # Local chat model served by Ollama (the model actually used is
    # "llama3.1", not "deepseek-r1" as older comments claimed).
    model = OllamaLLM(model="llama3.1")

    # Load the CSV and split it into chunks ready for embedding.
    docs = _docs_preprocessing_helper("survey.csv")

    # Embed the chunks and index them in an in-memory Chroma vector store.
    # NOTE(review): this re-embeds the whole CSV on every call — one OpenAI
    # round-trip per question. If survey.csv is static, consider building
    # the store once and reusing it.
    embedding_function = OpenAIEmbeddings(openai_api_key=openai_api_key)
    db = Chroma.from_documents(docs, embedding_function)

    # Prompt for the "stuff" chain. It must expose BOTH {context} (the
    # retrieved document chunks) and {question} (the user's query): the
    # original template omitted {question}, and str.format silently drops
    # unknown kwargs, so the model never saw what was actually asked.
    template = """You are a clothing consultant chatbot.

Answer the customer's questions. When relevant questions come, use the provided documents. Please answer to their specific question. If you are unsure, say "I don't know, please call our customer support". Use engaging, courteous, and professional language similar to a customer representative.
Keep your answers concise.

{context}

Question: {question}
"""
    prompt = PromptTemplate(
        template=template, input_variables=["context", "question"]
    )

    # Build the RetrievalQA chain:
    #   - "stuff" concatenates all retrieved chunks into one context;
    #   - the retriever returns the single most relevant chunk (k=1);
    #   - our prompt template replaces the chain's default prompt.
    chain = RetrievalQA.from_chain_type(
        llm=model,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 1}),
        chain_type_kwargs={"prompt": prompt},
    )
    return chain.run(input)


def _docs_preprocessing_helper(file):
    """Load a CSV file and split it into chunks for embedding.

    1. Loads the CSV with LangChain's ``CSVLoader`` (one document per row).
    2. Splits the documents into chunks of up to 1000 characters with no
       overlap using ``CharacterTextSplitter``.

    Args:
        file (str): Path to the CSV file.

    Returns:
        list: Document chunks ready for embedding and indexing.
    """
    loader = CSVLoader(file)
    docs = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(docs)