├── hello.py
├── .gitignore
├── data
│   ├── 2760342_PM_EN.pdf
│   └── DHT11-Technical-Sheet.pdf
├── __pycache__
│   ├── query_data.cpython-310.pyc
│   ├── get_embedding_function.cpython-310.pyc
│   └── test_cases.cpython-310-pytest-8.3.1.pyc
├── get_embedding_function.py
├── requirements.txt
├── README.md
├── query_data.py
├── test_cases.py
└── load_pdf.py

--------------------------------------------------------------------------------
/hello.py:
--------------------------------------------------------------------------------
print("Hello World")

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.env
chroma_*
chroma
.DS_Store
.idea

--------------------------------------------------------------------------------
/data/2760342_PM_EN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runtime/langchain-rag-openai/master/data/2760342_PM_EN.pdf

--------------------------------------------------------------------------------
/data/DHT11-Technical-Sheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runtime/langchain-rag-openai/master/data/DHT11-Technical-Sheet.pdf

--------------------------------------------------------------------------------
/__pycache__/query_data.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runtime/langchain-rag-openai/master/__pycache__/query_data.cpython-310.pyc

--------------------------------------------------------------------------------
/__pycache__/get_embedding_function.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runtime/langchain-rag-openai/master/__pycache__/get_embedding_function.cpython-310.pyc

--------------------------------------------------------------------------------
/__pycache__/test_cases.cpython-310-pytest-8.3.1.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runtime/langchain-rag-openai/master/__pycache__/test_cases.cpython-310-pytest-8.3.1.pyc

--------------------------------------------------------------------------------
/get_embedding_function.py:
--------------------------------------------------------------------------------
# OpenAI
from langchain_openai import OpenAIEmbeddings
import openai

# Environment variables
from dotenv import load_dotenv
import os

# Load OPENAI_API_KEY from .env. OpenAIEmbeddings reads the key from the
# environment itself, so the explicit assignment below also acts as an early
# check that the key is actually present.
load_dotenv()
openai.api_key = os.environ["OPENAI_API_KEY"]


def get_embedding_function():
    embeddings = OpenAIEmbeddings()
    return embeddings
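
# Optional smoke test: a minimal sketch, assuming OPENAI_API_KEY is set in
# your .env. With langchain-openai 0.1.8, OpenAIEmbeddings defaults to the
# text-embedding-ada-002 model, which returns 1536-dimensional vectors.
if __name__ == "__main__":
    vector = get_embedding_function().embed_query("hello world")
    print(f"embedding dimensions: {len(vector)}")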
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
python-dotenv==1.0.1   # For reading environment variables stored in the .env file
langchain==0.2.2
langchain-community==0.2.3
langchain-openai==0.1.8   # For embeddings
unstructured==0.14.4   # Document loading
# onnxruntime==1.17.1   # chromadb dependency: on Mac use `conda install onnxruntime -c conda-forge`.
# Windows users should install the Microsoft Visual C++ Build Tools first,
# then install onnxruntime before installing `chromadb`.
chromadb==0.5.0   # Vector storage
openai==1.31.1   # For embeddings
tiktoken==0.7.0   # For embeddings

# Install the markdown dependencies with `pip install "unstructured[md]"` after
# installing this requirements file. Leave this line commented out.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Arduino Sensor RAG w/OpenAI Embeddings & Mistral (on Ollama) for Query Matching

This is an MVP of an LLM document-search RAG.

Requirements doc:
* Scan PDFs (pypdf) (AWS Textract)
* Create pages
* Chunk pages (langchain)
* Embed chunks (OpenAI)
* Store in a vector DB (Chroma)
* Test our embeddings (pytest)
* Retrieve with a search query (Mistral)

## Use Guide
### Install dependencies
Run this command to install the dependencies listed in the `requirements.txt` file:

```bash
pip install -r requirements.txt
```

### Additional installs

```bash
pip install pytest
pip install pypdf
```
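## Environment setup

The embedding step reads your OpenAI key from a `.env` file in the project root (see `get_embedding_function.py`). A minimal sketch of that file, substituting your own key for the placeholder:

```bash
OPENAI_API_KEY=sk-your-key-here
```

Steps 2 and 3 below run Mistral locally through Ollama, so install [Ollama](https://ollama.com) and pull the model before querying:

```bash
ollama pull mistral
```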
## Step 1: Start or Add to an Existing Chroma DB

To scan all the PDF files in the `data` folder and load them into the RAG, run:

```bash
python load_pdf.py
```

This scans the PDFs with pypdf through the langchain document loader, splits the docs into pages, and then chunks them. The chunks are embedded and stored in Chroma. Pass `--reset` to clear the database first.

## Step 2: Query the database

Query the Chroma DB and use Mistral to create an answer:

```bash
python query_data.py "Your question relevant to the context of the application"
```

## Step 3: Test the Query Returns using pytest and Mistral

Test Mistral's answers using pytest:

```bash
pytest test_cases.py
```

--------------------------------------------------------------------------------
/query_data.py:
--------------------------------------------------------------------------------
import argparse
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
from get_embedding_function import get_embedding_function

CHROMA_PATH = "chroma_db"

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


def main():
    # Create the CLI.
    parser = argparse.ArgumentParser()
    parser.add_argument("query_text", type=str, help="The query text.")
    args = parser.parse_args()
    query_text = args.query_text
    query_rag(query_text)


def query_rag(query_text: str):
    # Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB for the five chunks most similar to the query.
    results = db.similarity_search_with_score(query_text, k=5)

    # Fill the template above with the retrieved context and the question.
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    # Use an open-source model running locally to do the NLU for the search.
    model = Ollama(model="mistral")
    response_text = model.invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
    return response_text


if __name__ == "__main__":
    main()
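
# Example usage (illustrative; any question grounded in the PDFs under data/ works):
#   python query_data.py "What is the detection angle of the ultrasonic sensor?"
# The script prints the assembled prompt, Mistral's answer, and the IDs
# (source:page:chunk) of the retrieved chunks it used as context.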
--------------------------------------------------------------------------------
/test_cases.py:
--------------------------------------------------------------------------------
from query_data import query_rag
from langchain_community.llms.ollama import Ollama

EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response?
"""

def test_dht11_voltage():
    assert query_and_validate(
        question="How many volts does the DHT11 sensor require to run?",
        expected_response="The DHT11 sensor requires a minimum of 3V and can work up to 5.5V. Therefore, it typically operates at 5V.",
    )

def test_ultrasonic_detection_angle():
    assert query_and_validate(
        question="What is the detection angle of the ultrasonic sensor?",
        expected_response="30°",
    )

def test_ultrasonic_voltage():
    assert query_and_validate(
        question="What is the voltage of the ultrasonic sensor?",
        expected_response="5V",
    )

def test_ultrasonic_current():
    assert query_and_validate(
        question="What is the current of the ultrasonic sensor?",
        expected_response="15mA",
    )

def test_ultrasonic_range():
    assert query_and_validate(
        question="What is the range of the ultrasonic sensor?",
        expected_response="1.2 in – 13 ft (3 cm – 4 m) Ultrasonic",
    )

def query_and_validate(question: str, expected_response: str):
    response_text = query_rag(question)
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response, actual_response=response_text
    )

    model = Ollama(model="mistral")
    evaluation_results_str = model.invoke(prompt)
    evaluation_results_str_cleaned = evaluation_results_str.strip().lower()

    print(prompt)

    if "true" in evaluation_results_str_cleaned:
        # Print the response in green if it is correct.
        print("\033[92m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return True
    elif "false" in evaluation_results_str_cleaned:
        # Print the response in red if it is incorrect.
        print("\033[91m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return False
    else:
        raise ValueError(
            "Invalid evaluation result. Cannot determine if 'true' or 'false'."
        )
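
# Note: these tests follow an LLM-as-judge pattern; Mistral grades the RAG
# answer against the expected response, so outcomes can vary between runs.
# Run with output capture disabled to see the colored verdicts:
#   pytest test_cases.py -s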
--------------------------------------------------------------------------------
/load_pdf.py:
--------------------------------------------------------------------------------
# Arg parser for the database --reset flag
import argparse
# from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
# Splitter lib
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
# Shared utility function for embeddings
from get_embedding_function import get_embedding_function
# Chroma vector store
from langchain_community.vectorstores import Chroma
# Filesystem checks and DB persistence
import os
import shutil


CHROMA_PATH = "chroma_db"
FILE_PATH = "data"


def main():
    print("f[main]")
    init()

def init():
    print("f[init]")
    parser = argparse.ArgumentParser()
    parser.add_argument("--reset", action="store_true", help="Reset the database.")
    args = parser.parse_args()
    if args.reset:
        print("✨ Clearing Database")
        clear_database()

    # Create (or update) the data store.
    # load_pdf(FILE_PATH)  # <-- uses pypdf's splitter
    documents = load_documents()  # <-- uses the langchain splitter
    # load_documents returns both the page content and the metadata.
    print("f[init] documents[0]: ", documents[0])
    print("f[init] documents[1]: ", documents[1])
    # Use the split_documents function, which applies langchain's recursive splitter.
    chunks = split_documents(documents)
    # print("f[init] chunks[0]: ", chunks[0])
    add_to_chroma(chunks)


# def load_pdf(FILE_PATH: str) -> list[str]:
#     loader = PyPDFLoader(FILE_PATH)
#     # load_and_split seems the same
#     pages = loader.load_and_split()
#     print("f[load_pdf] complete, pages: ", len(pages))
#     print("f[load_pdf] (example) page 1: ", pages[0].page_content)
#     return pages

def load_documents():
    print("f[load_documents] FILE_PATH ", FILE_PATH)
    # PyPDFLoader handles a single file; the directory loader is needed
    # to load every PDF in the folder.
    # loader = PyPDFLoader(FILE_PATH)
    loader = PyPDFDirectoryLoader(FILE_PATH)
    return loader.load()

# Recursive text splitter
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

# Multi-use for storing and retrieving
# def get_embeddings():
#     embeddings = get_embedding_function()
#     return embeddings

# Creates a new database from scratch:
# def add_to_chroma(chunks):
#     if os.path.exists(CHROMA_PATH):
#         print('[load_pdf] chroma db already exists')
#         shutil.rmtree(CHROMA_PATH)
#     vectordb = Chroma.from_documents(documents=chunks, embedding=get_embeddings(), persist_directory=CHROMA_PATH)
#     print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

# Add to an existing Chroma DB. Credit to pixegami for the chunking with IDs,
# which solves the page-vs-chunk ID issue.
def add_to_chroma(chunks: list[Document]):
    # Load the existing database.
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    # Calculate page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or update the documents.
    existing_items = db.get(include=[])  # IDs are always included by default.
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add documents that don't already exist in the DB.
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)
    print("new_chunks: ", new_chunks)

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        # With chromadb >= 0.4 writes are persisted automatically;
        # persist() is kept for compatibility with older versions.
        db.persist()
    else:
        print("✅ No new documents to add")


def calculate_chunk_ids(chunks):

    # This will create IDs like "data/monopoly.pdf:6:2"
    # Page Source : Page Number : Chunk Index

    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the chunk metadata.
        chunk.metadata["id"] = chunk_id

    return chunks


def clear_database():
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------