├── README.md ├── db ├── c05505f9-768f-4252-a239-185598b4e57e │ ├── link_lists.bin │ ├── header.bin │ └── length.bin └── chroma.sqlite3 ├── requirements.txt └── app.py /README.md: -------------------------------------------------------------------------------- 1 | # RAG-Embedding -------------------------------------------------------------------------------- /db/c05505f9-768f-4252-a239-185598b4e57e/link_lists.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /db/chroma.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hrishikesh332/RAG-Embedding/HEAD/db/chroma.sqlite3 -------------------------------------------------------------------------------- /db/c05505f9-768f-4252-a239-185598b4e57e/header.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hrishikesh332/RAG-Embedding/HEAD/db/c05505f9-768f-4252-a239-185598b4e57e/header.bin -------------------------------------------------------------------------------- /db/c05505f9-768f-4252-a239-185598b4e57e/length.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hrishikesh332/RAG-Embedding/HEAD/db/c05505f9-768f-4252-a239-185598b4e57e/length.bin -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyPDF2 2 | streamlit 3 | openai 4 | streamlit_option_menu 5 | sentence-transformers 6 | langchain==0.1.0 7 | langchain_experimental==0.0.49 8 | openai==1.7.1 9 | tabulate==0.9.0 10 | InstructorEmbedding 11 | torch 12 | sentence-transformers 13 | chromadb 14 | pysqlite3-binary 15 | tiktoken 16 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import textwrap 3 | from PyPDF2 import PdfReader 4 | import streamlit as st 5 | import streamlit.components.v1 as components 6 | from streamlit_option_menu import option_menu 7 | from openai import OpenAI 8 | from langchain.text_splitter import CharacterTextSplitter 9 | from langchain.embeddings.openai import OpenAIEmbeddings 10 | from langchain import FAISS 11 | from langchain.chains.question_answering import load_qa_chain 12 | from langchain.llms import OpenAI 13 | from langchain.callbacks import get_openai_callback 14 | from langchain import OpenAI 15 | from langchain.chains import RetrievalQA 16 | from langchain_community.llms import OpenAI 17 | from langchain_community.vectorstores import Chroma 18 | from langchain.embeddings import HuggingFaceEmbeddings 19 | __import__('pysqlite3') 20 | import sys 21 | sys.modules['sqlite3'] = sys.modules.pop('pysqlite3') 22 | 23 | 24 | page_element=""" 25 | 38 | 39 | """ 40 | 41 | st.markdown(page_element, unsafe_allow_html=True) 42 | st.markdown("

Power of Laws 💬

", unsafe_allow_html=True) 43 | st.markdown("---") 44 | 45 | 46 | 47 | llm = OpenAI(openai_api_key=st.secrets["LLM_API"]) 48 | 49 | 50 | def process_text(text): 51 | 52 | text_splitter = CharacterTextSplitter( 53 | separator="\n", 54 | chunk_size=1000, 55 | chunk_overlap=200, 56 | length_function=len 57 | ) 58 | chunks = text_splitter.split_text(text) 59 | 60 | embeddings = OpenAIEmbeddings(openai_api_key=st.secrets["LLM_API"]) 61 | knowledgeBase = FAISS.from_texts(chunks, embeddings) 62 | 63 | return knowledgeBase 64 | 65 | def wrap_text_preserve_newlines(text, width=110): 66 | 67 | lines = text.split('\n') 68 | wrapped_lines = [textwrap.fill(line, width=width) for line in lines] 69 | # Join the wrapped lines back together using newline characters 70 | wrapped_text = '\n'.join(wrapped_lines) 71 | return wrapped_text 72 | 73 | def process_llm_response(llm_response): 74 | result_output = wrap_text_preserve_newlines(llm_response['result']) 75 | print(result_output) 76 | return result_output 77 | 78 | 79 | flow_option = st.selectbox( 80 | 'Choose an Option -', 81 | ('Power of Laws', 'Upload Another PDF')) 82 | 83 | if flow_option == 'Power of Laws': 84 | query = st.text_input('Ask a question to the PDF') 85 | submit=st.button("Submit") 86 | 87 | model_name = "sentence-transformers/all-mpnet-base-v2" 88 | instructor_embeddings = HuggingFaceEmbeddings( 89 | model_name=model_name, 90 | model_kwargs={'device': 'cpu'}, 91 | encode_kwargs={'normalize_embeddings': False} 92 | ) 93 | 94 | embedding = instructor_embeddings 95 | persist_directory='db' 96 | vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding) 97 | retriever = vectordb.as_retriever(search_kwargs={"k": 3}) 98 | qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True) 99 | if submit: 100 | query = f""" 101 | Do strictly follow the context and for not retrieved data, output as No data. The context is from the Book of Power of Laws - {query} 102 | """ 103 | 104 | llm_response = qa_chain(query) 105 | result_ipc=process_llm_response(llm_response) 106 | st.write(result_ipc) 107 | 108 | 109 | 110 | elif flow_option == 'Upload Another PDF': 111 | 112 | pdf = st.file_uploader('Upload your PDF Document', type='pdf') 113 | 114 | if pdf is not None: 115 | pdf_reader = PdfReader(pdf) 116 | text = "" 117 | for page in pdf_reader.pages: 118 | text += page.extract_text() 119 | 120 | # Create the knowledge base object 121 | knowledgeBase = process_text(text) 122 | query = st.text_input('Ask a question to the PDF') 123 | cancel_button = st.button('Cancel') 124 | 125 | if cancel_button: 126 | st.stop() 127 | 128 | if query: 129 | docs = knowledgeBase.similarity_search(query) 130 | chain = load_qa_chain(llm, chain_type='stuff') 131 | 132 | with get_openai_callback() as cost: 133 | response = chain.run(input_documents=docs, question=query) 134 | print(cost) 135 | 136 | st.write(response) 137 | --------------------------------------------------------------------------------