├── .env.example ├── README.md ├── pyproject.toml ├── datachatbot ├── ingest.py └── main.py └── .gitignore /.env.example: -------------------------------------------------------------------------------- 1 | HUGGINGFACEHUB_API_TOKEN=XXXXXXXXX -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | How to run it: 2 | 3 | - Rename `.env.example` to `.env` 4 | - Replace XXX with the hugging face token 5 | - Create a docs folder at the root of the project 6 | - Add any documents in it 7 | 8 | - Run `poetry install` 9 | - Run `poetry shell` 10 | - Open the project in VS Code and press Cmd + Shift + P 11 | - Select python interpreter 12 | 13 | - Run the `ingestion.py` file and wait for completion from VS Code 14 | - Run the `main.py` from VS Code 15 | 16 | You can now ask questions from your documents inside the terminal -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "datachatbot" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Donatien Thorez "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | langchain = "^0.0.302" 11 | tqdm = "^4.66.1" 12 | unstructured = {extras = ["all-docs"], version = "^0.10.12"} 13 | sentence-transformers = "^2.2.2" 14 | faiss-cpu = "^1.7.4" 15 | python-dotenv = "^1.0.0" 16 | 17 | 18 | [build-system] 19 | requires = ["poetry-core"] 20 | build-backend = "poetry.core.masonry.api" 21 | -------------------------------------------------------------------------------- /datachatbot/ingest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from langchain.document_loaders import DirectoryLoader 5 | from langchain.text_splitter import RecursiveCharacterTextSplitter 6 | from langchain.embeddings import HuggingFaceEmbeddings 7 | from langchain.vectorstores import FAISS 8 | 9 | def load_and_split_documents(chunk_size, chunk_overlap): 10 | print ("Loading documents...") 11 | loader = DirectoryLoader( 12 | path='./docs', 13 | glob="**/*", 14 | show_progress=False, 15 | use_multithreading=True 16 | ) 17 | print ("Documents loaded") 18 | print ("Creating chunks...") 19 | text_splitter = RecursiveCharacterTextSplitter( 20 | chunk_size = chunk_size, 21 | chunk_overlap = chunk_overlap, 22 | length_function = len, 23 | ) 24 | 25 | chunks = loader.load_and_split( 26 | text_splitter=text_splitter 27 | ) 28 | print("Created", len(chunks), "chunks of data") 29 | return chunks 30 | 31 | def main(): 32 | load_dotenv() 33 | chunk_size = 1024 34 | chunk_overlap = 100 35 | 36 | chunks = load_and_split_documents(chunk_size, chunk_overlap) 37 | embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") 38 | vectorstore = FAISS.from_documents(documents = chunks, embedding=embeddings) 39 | vectorstore.save_local("faiss_vector_db") 40 | print("Success! Vector Store has been created!") 41 | 42 | 43 | if __name__ == "__main__": 44 | main() -------------------------------------------------------------------------------- /datachatbot/main.py: -------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | from langchain.llms import HuggingFaceHub 3 | from langchain.embeddings import HuggingFaceEmbeddings 4 | from langchain.vectorstores import FAISS 5 | from langchain.chains import RetrievalQA 6 | 7 | if __name__ == '__main__': 8 | load_dotenv() 9 | embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") 10 | vector_store = FAISS.load_local("faiss_vector_db", embeddings= embeddings) 11 | repo_id = "tiiuae/falcon-7b-instruct" 12 | llm = HuggingFaceHub( 13 | repo_id=repo_id, 14 | model_kwargs={ 15 | "temperature": 0.1, 16 | "max_new_tokens":100, 17 | }, 18 | ) 19 | 20 | qa = RetrievalQA.from_chain_type( 21 | llm=llm, 22 | chain_type="stuff", 23 | retriever=vector_store.as_retriever(), 24 | return_source_documents = True 25 | ) 26 | 27 | exit_conditions = (":q", "quit", "exit") 28 | while True: 29 | query = input("\nType your question\n") 30 | 31 | if query in exit_conditions: 32 | break 33 | else: 34 | result = qa(query) 35 | print("-" * 60) 36 | for index, doc in enumerate(result["source_documents"], start=1): 37 | print(f"Document {index}:") 38 | print("Page Content:") 39 | print(doc.page_content) 40 | print("Metadata:", doc.metadata) 41 | print("-" * 40) 42 | 43 | print("\n") 44 | print("Result:", result["result"]) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos 32 | into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since 88 | the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include 94 | Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific 96 | dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that 98 | don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include 104 | poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure 106 | reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # 109 | https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 110 | #poetry.lock 111 | 112 | # pdm 113 | # Similar to Pipfile.lock, it is generally recommended to include 114 | pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is 117 | recommended to not include it 118 | # in version control. 119 | # https://pdm.fming.dev/#use-with-ide 120 | .pdm.toml 121 | 122 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and 123 | github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate 168 | JetBrains.gitignore that can 169 | # be found at 170 | https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 171 | # and can be added to the global gitignore or merged into this file. For 172 | a more nuclear 173 | # option (not recommended) you can uncomment the following to ignore the 174 | entire idea folder. 175 | #.idea/ 176 | --------------------------------------------------------------------------------