├── .env
├── data
│   └── iso27001.pdf
├── requirements.txt
├── add_document.py
├── create_index.py
├── config.yaml
├── document_chat.py
├── telegram_bot.py
├── utils.py
└── readme.md
/.env:
--------------------------------------------------------------------------------
OPENAI_API_KEY=
TELEGRAM_KEY=
--------------------------------------------------------------------------------
/data/iso27001.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leon-Sander/langchain_faiss_vectorindex/HEAD/data/iso27001.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
langchain
langchain-community
langchain-openai
python-dotenv
openai
pypdf
sentence-transformers
faiss-cpu
#faiss-gpu
python-telegram-bot
tqdm
--------------------------------------------------------------------------------
/add_document.py:
--------------------------------------------------------------------------------
from utils import load_documents, load_db, save_db, load_embeddings

# Load the existing FAISS index, embed the PDFs from new_document/,
# append them to the index, then persist it again.
db = load_db(embedding_function=load_embeddings())
db.add_documents(load_documents("new_document/"))
save_db(db)
--------------------------------------------------------------------------------
/create_index.py:
--------------------------------------------------------------------------------
from langchain_community.vectorstores import FAISS
from utils import load_documents, save_db, load_embeddings

embedding_function = load_embeddings()
documents = load_documents("data/")

# Build a fresh FAISS index from every PDF in data/ and persist it.
db = FAISS.from_documents(documents, embedding_function)
print("Index created")
save_db(db)

# Quick smoke test: the top hits should come from the indexed PDF.
print(db.similarity_search("ISO/IEC 27001 standard"))
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
faiss_indexstore:
  save_path: 'faiss_db/'
  index_name: 'books'

embeddings:
  name: 'all-MiniLM-L6-v2'
  device: 'cpu'

TextSplitter:
  # chunk_size is measured in characters, not tokens.
  # The embedding model can only handle 256 tokens per input, and by
  # OpenAI's rule of thumb one token is roughly four characters, so a
  # chunk size of 1024 characters approximates 256 tokens.
  chunk_size: 1024
  chunk_overlap: 52
--------------------------------------------------------------------------------
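Note: the four-characters-per-token rule in config.yaml is only a rough heuristic, and all-MiniLM-L6-v2 uses its own WordPiece tokenizer. A minimal sketch (not part of the repository) to check how many tokens a 1024-character chunk actually produces, assuming sentence-transformers is installed:

```python
# Sanity check for the chunk_size heuristic in config.yaml: tokenize a
# 1024-character sample with the configured embedding model's tokenizer.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
chunk = "information security management " * 32  # exactly 1024 characters
token_ids = model.tokenizer(chunk)["input_ids"]
print(f"{len(chunk)} characters -> {len(token_ids)} tokens "
      f"(the model truncates inputs beyond {model.max_seq_length} tokens)")
```

If the printed token count regularly exceeds the model's maximum sequence length, the tail of each chunk is silently dropped before embedding, so a smaller chunk_size would be the safer choice.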
/document_chat.py:
--------------------------------------------------------------------------------
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from utils import load_embeddings, load_db

load_dotenv()

class RetrievalChat:

    def __init__(self) -> None:
        embedding_function = load_embeddings()
        db = load_db(embedding_function)
        # Retrieve the three most similar chunks for each query.
        self.qa_model = RetrievalQA.from_llm(
            llm=ChatOpenAI(temperature=0.1),
            retriever=db.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True,
        )

    def answer_question(self, question: str):
        output = self.qa_model.invoke({"query": question})
        #print("Source Documents: ")
        #print(output["source_documents"])
        return output["result"]

if __name__ == "__main__":
    qa_chat = RetrievalChat()
    while True:
        print("What's your question ('exit' to quit):")
        query = input()
        if query == "exit":
            break
        print(qa_chat.answer_question(query))
--------------------------------------------------------------------------------
/telegram_bot.py:
--------------------------------------------------------------------------------
import os
from dotenv import load_dotenv
import logging
from telegram import Update
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, filters, MessageHandler, CallbackContext
from document_chat import RetrievalChat

load_dotenv()

qa = RetrievalChat()

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await context.bot.send_message(chat_id=update.effective_chat.id, text="I'm a bot, please talk to me!")

def generate_answer(user_message):
    return qa.answer_question(user_message)

async def handle_message(update: Update, context: CallbackContext) -> None:
    user_message = update.message.text  # Extract the user's message
    response = generate_answer(user_message)  # Generate a response
    await context.bot.send_message(chat_id=update.effective_chat.id, text=response)  # Send the response

if __name__ == '__main__':
    application = ApplicationBuilder().token(os.getenv("TELEGRAM_KEY")).build()

    start_handler = CommandHandler('start', start)
    message_handler = MessageHandler(filters.TEXT & (~filters.COMMAND), handle_message)
    application.add_handler(start_handler)
    application.add_handler(message_handler)

    application.run_polling()
--------------------------------------------------------------------------------
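A caveat on telegram_bot.py: handle_message awaits nothing while generate_answer runs, so the synchronous OpenAI call blocks the bot's event loop and other chats stall until the answer arrives. A minimal non-blocking variant, assuming Python 3.9+ for asyncio.to_thread:

```python
import asyncio
from telegram import Update
from telegram.ext import CallbackContext

# Drop-in replacement for handle_message: run the blocking QA call in a
# worker thread so the event loop stays free to serve other updates.
async def handle_message(update: Update, context: CallbackContext) -> None:
    user_message = update.message.text
    response = await asyncio.to_thread(generate_answer, user_message)
    await context.bot.send_message(chat_id=update.effective_chat.id, text=response)
```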
/utils.py:
--------------------------------------------------------------------------------
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from glob import glob
from tqdm import tqdm
import yaml

def load_config():
    with open('config.yaml', 'r') as file:
        config = yaml.safe_load(file)
    return config

config = load_config()

def load_embeddings(model_name=config["embeddings"]["name"],
                    model_kwargs={'device': config["embeddings"]["device"]}):
    return HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

def load_documents(directory: str):
    """Loads all PDFs from a directory and returns a list of split Document objects.
    args: directory format = directory/
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=config["TextSplitter"]["chunk_size"],
                                                   chunk_overlap=config["TextSplitter"]["chunk_overlap"])
    documents = []
    for item_path in tqdm(glob(directory + "*.pdf")):
        loader = PyPDFLoader(item_path)
        documents.extend(loader.load_and_split(text_splitter=text_splitter))

    return documents

def load_db(embedding_function, save_path=config["faiss_indexstore"]["save_path"], index_name=config["faiss_indexstore"]["index_name"]):
    db = FAISS.load_local(folder_path=save_path, index_name=index_name, embeddings=embedding_function)
    return db

def save_db(db, save_path=config["faiss_indexstore"]["save_path"], index_name=config["faiss_indexstore"]["index_name"]):
    db.save_local(save_path, index_name)
    print("Saved db to " + save_path + index_name)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
### Document Retrieval with FAISS vectorstore, LangChain and OpenAI

This is the repository for my tutorial on YouTube: https://youtu.be/N7TQgp18kA4
It indexes the PDF documents from the data directory and stores them in faiss_db.

You can then retrieve information from your documents, which requires an OpenAI API key.
Run document_chat.py to query your documents, or run telegram_bot.py to query them on
Telegram. The bot requires an API token, which you can get from the BotFather in Telegram.

You can add new documents to the index by placing new PDF files in the new_document
directory and running add_document.py.

If you want to run everything on GPU, install faiss-gpu (you must uninstall faiss-cpu
first) and put the embeddings on the GPU by setting the device in config.yaml, as shown
in the excerpt at the end of this readme.

### Getting Started

To get started with the document chat, clone the repository and follow these steps:

1. **Upgrade pip**: ```pip install --upgrade pip```

2. **Install requirements**: ```pip install -r requirements.txt```

3. **Set API keys**: enter your API keys in the .env file

4. **Customize the config file**: check config.yaml and adjust it to the models you want to use.

5. **Create the index**: run ```python3 create_index.py```; it creates the FAISS index from all documents in the data folder

6. **Start chatting**: ```python3 document_chat.py```


### Changes I made to the code you see in the video

I added a config.yaml file where you can define the Hugging Face embeddings you want to use
and the device to run them on (cpu or cuda). The save_path and index_name for the FAISS
vectorstore are also defined there.

I added the functions load_db, save_db, load_embeddings and load_config in the utils file
to reduce repetitive code in the other .py files.
--------------------------------------------------------------------------------
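For reference, the GPU setup mentioned in the readme only touches the embeddings block of config.yaml; swapping faiss-cpu for faiss-gpu happens in requirements.txt. A sketch of the changed excerpt, assuming a CUDA-capable GPU and a CUDA-enabled PyTorch build:

```yaml
# config.yaml excerpt for GPU inference; everything else stays unchanged.
embeddings:
  name: 'all-MiniLM-L6-v2'
  device: 'cuda'
```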