├── .env
├── data
│   └── iso27001.pdf
├── requirements.txt
├── add_document.py
├── create_index.py
├── config.yaml
├── document_chat.py
├── telegram_bot.py
├── utils.py
└── readme.md
/.env:
--------------------------------------------------------------------------------
OPENAI_API_KEY=
TELEGRAM_KEY=
--------------------------------------------------------------------------------
/data/iso27001.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leon-Sander/langchain_faiss_vectorindex/HEAD/data/iso27001.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
langchain
langchain-community
langchain-openai
python-dotenv
openai
pypdf
sentence-transformers
faiss-cpu
#faiss-gpu
python-telegram-bot
tqdm
--------------------------------------------------------------------------------
/add_document.py:
--------------------------------------------------------------------------------
from utils import load_documents, load_db, save_db, load_embeddings

# Load the existing FAISS index, embed the PDFs from new_document/,
# append them to the index, then persist it again.
db = load_db(embedding_function=load_embeddings())
db.add_documents(load_documents("new_document/"))
save_db(db)
--------------------------------------------------------------------------------
/create_index.py:
--------------------------------------------------------------------------------
from langchain_community.vectorstores import FAISS
from utils import load_documents, save_db, load_embeddings

embedding_function = load_embeddings()
documents = load_documents("data/")

# Build a fresh FAISS index from every PDF in data/ and persist it.
db = FAISS.from_documents(documents, embedding_function)
print("Index created")
save_db(db)

# Quick smoke test: the top hits should come from the indexed PDF.
print(db.similarity_search("ISO/IEC 27001 standard"))
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
faiss_indexstore:
  save_path: 'faiss_db/'
  index_name: 'books'

embeddings:
  name: 'all-MiniLM-L6-v2'
  device: 'cpu'

TextSplitter:
  # chunk_size is measured in characters, not tokens.
  # The embedding model can only handle 256 tokens per input, and by
  # OpenAI's rule of thumb one token is roughly four characters, so a
  # chunk size of 1024 characters approximates 256 tokens.
  chunk_size: 1024
  chunk_overlap: 52
--------------------------------------------------------------------------------
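Note: the four-characters-per-token rule in config.yaml is only a rough heuristic, and all-MiniLM-L6-v2 uses its own WordPiece tokenizer. A minimal sketch (not part of the repository) to check how many tokens a 1024-character chunk actually produces, assuming sentence-transformers is installed:

```python
# Sanity check for the chunk_size heuristic in config.yaml: tokenize a
# 1024-character sample with the configured embedding model's tokenizer.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
chunk = "information security management " * 32  # exactly 1024 characters
token_ids = model.tokenizer(chunk)["input_ids"]
print(f"{len(chunk)} characters -> {len(token_ids)} tokens "
      f"(the model truncates inputs beyond {model.max_seq_length} tokens)")
```

If the printed token count regularly exceeds the model's maximum sequence length, the tail of each chunk is silently dropped before embedding, so a smaller chunk_size would be the safer choice.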
/document_chat.py:
--------------------------------------------------------------------------------
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from utils import load_embeddings, load_db

load_dotenv()

class RetrievalChat:

    def __init__(self) -> None:
        embedding_function = load_embeddings()
        db = load_db(embedding_function)
        # Retrieve the three most similar chunks for each query.
        self.qa_model = RetrievalQA.from_llm(
            llm=ChatOpenAI(temperature=0.1),
            retriever=db.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True,
        )

    def answer_question(self, question: str):
        output = self.qa_model.invoke({"query": question})
        #print("Source Documents: ")
        #print(output["source_documents"])
        return output["result"]

if __name__ == "__main__":
    qa_chat = RetrievalChat()
    while True:
        print("What's your question ('exit' to quit):")
        query = input()
        if query == "exit":
            break
        print(qa_chat.answer_question(query))
--------------------------------------------------------------------------------
/telegram_bot.py:
--------------------------------------------------------------------------------
import os
from dotenv import load_dotenv
import logging
from telegram import Update
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, filters, MessageHandler, CallbackContext
from document_chat import RetrievalChat

load_dotenv()

qa = RetrievalChat()

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await context.bot.send_message(chat_id=update.effective_chat.id, text="I'm a bot, please talk to me!")

def generate_answer(user_message):
    return qa.answer_question(user_message)

async def handle_message(update: Update, context: CallbackContext) -> None:
    user_message = update.message.text  # Extract the user's message
    response = generate_answer(user_message)  # Generate a response
    await context.bot.send_message(chat_id=update.effective_chat.id, text=response)  # Send the response

if __name__ == '__main__':
    application = ApplicationBuilder().token(os.getenv("TELEGRAM_KEY")).build()

    start_handler = CommandHandler('start', start)
    message_handler = MessageHandler(filters.TEXT & (~filters.COMMAND), handle_message)
    application.add_handler(start_handler)
    application.add_handler(message_handler)

    application.run_polling()
--------------------------------------------------------------------------------
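A caveat on telegram_bot.py: handle_message awaits nothing while generate_answer runs, so the synchronous OpenAI call blocks the bot's event loop and other chats stall until the answer arrives. A minimal non-blocking variant, assuming Python 3.9+ for asyncio.to_thread:

```python
import asyncio
from telegram import Update
from telegram.ext import CallbackContext

# Drop-in replacement for handle_message: run the blocking QA call in a
# worker thread so the event loop stays free to serve other updates.
async def handle_message(update: Update, context: CallbackContext) -> None:
    user_message = update.message.text
    response = await asyncio.to_thread(generate_answer, user_message)
    await context.bot.send_message(chat_id=update.effective_chat.id, text=response)
```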
/utils.py:
--------------------------------------------------------------------------------
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from glob import glob
from tqdm import tqdm
import yaml

def load_config():
    with open('config.yaml', 'r') as file:
        config = yaml.safe_load(file)
    return config

config = load_config()

def load_embeddings(model_name=config["embeddings"]["name"],
                    model_kwargs={'device': config["embeddings"]["device"]}):
    return HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

def load_documents(directory: str):
    """Loads all PDFs from a directory and returns a list of split Document objects.
    args: directory format = directory/
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=config["TextSplitter"]["chunk_size"],
                                                   chunk_overlap=config["TextSplitter"]["chunk_overlap"])
    documents = []
    for item_path in tqdm(glob(directory + "*.pdf")):
        loader = PyPDFLoader(item_path)
        documents.extend(loader.load_and_split(text_splitter=text_splitter))

    return documents

def load_db(embedding_function, save_path=config["faiss_indexstore"]["save_path"], index_name=config["faiss_indexstore"]["index_name"]):
    db = FAISS.load_local(folder_path=save_path, index_name=index_name, embeddings=embedding_function)
    return db

def save_db(db, save_path=config["faiss_indexstore"]["save_path"], index_name=config["faiss_indexstore"]["index_name"]):
    db.save_local(save_path, index_name)
    print("Saved db to " + save_path + index_name)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
### Document Retrieval with FAISS vectorstore, LangChain and OpenAI

This is the repository for my tutorial on YouTube: https://youtu.be/N7TQgp18kA4
It indexes the PDF documents from the data directory and stores them in faiss_db.

You can then retrieve information from your documents, which requires an OpenAI API key.
Run document_chat.py to query your documents, or run telegram_bot.py to query them on
Telegram. The bot requires an API token, which you can get from the BotFather in Telegram.

You can add new documents to the index by placing new PDF files in the new_document
directory and running add_document.py.

If you want to run everything on GPU, install faiss-gpu (you must uninstall faiss-cpu
first) and put the embeddings on the GPU by setting the device in config.yaml, as shown
in the excerpt at the end of this readme.

### Getting Started

To get started with the document chat, clone the repository and follow these steps:

1. **Upgrade pip**: ```pip install --upgrade pip```

2. **Install requirements**: ```pip install -r requirements.txt```

3. **Set API keys**: enter your API keys in the .env file

4. **Customize the config file**: check config.yaml and adjust it to the models you want to use.

5. **Create the index**: run ```python3 create_index.py```; it creates the FAISS index from all documents in the data folder

6. **Start chatting**: ```python3 document_chat.py```


### Changes I made to the code you see in the video

I added a config.yaml file where you can define the Hugging Face embeddings you want to use
and the device to run them on (cpu or cuda). The save_path and index_name for the FAISS
vectorstore are also defined there.

I added the functions load_db, save_db, load_embeddings and load_config in the utils file
to reduce repetitive code in the other .py files.
--------------------------------------------------------------------------------
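For reference, the GPU setup mentioned in the readme only touches the embeddings block of config.yaml; swapping faiss-cpu for faiss-gpu happens in requirements.txt. A sketch of the changed excerpt, assuming a CUDA-capable GPU and a CUDA-enabled PyTorch build:

```yaml
# config.yaml excerpt for GPU inference; everything else stays unchanged.
embeddings:
  name: 'all-MiniLM-L6-v2'
  device: 'cuda'
```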