├── .env.example
├── README.md
├── pyproject.toml
├── datachatbot
    ├── ingest.py
    └── main.py
└── .gitignore


/.env.example:
--------------------------------------------------------------------------------
1 | HUGGINGFACEHUB_API_TOKEN=XXXXXXXXX


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | How to run it:
 2 | 
 3 | - Rename `.env.example` to `.env` 
 4 | - Replace `XXXXXXXXX` with your Hugging Face API token
 5 | - Create a docs folder at the root of the project
 6 | - Add any documents in it
 7 | 
 8 | - Run `poetry install`
 9 | - Run `poetry shell`
10 | - Open the project in VS Code and press Cmd + Shift + P
11 | - Select python interpreter
12 | 
13 | - Run the `ingest.py` file from VS Code and wait for it to complete
14 | - Run the `main.py` from VS Code
15 | 
16 | You can now ask questions about your documents inside the terminal


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "datachatbot"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["Donatien Thorez <donatienthorez@gmail.com>"]
 6 | readme = "README.md"
 7 | 
 8 | [tool.poetry.dependencies]
 9 | python = "^3.11"
10 | langchain = "^0.0.302"
11 | tqdm = "^4.66.1"
12 | unstructured = {extras = ["all-docs"], version = "^0.10.12"}
13 | sentence-transformers = "^2.2.2"
14 | faiss-cpu = "^1.7.4"
15 | python-dotenv = "^1.0.0"
16 | 
17 | 
18 | [build-system]
19 | requires = ["poetry-core"]
20 | build-backend = "poetry.core.masonry.api"
21 | 


--------------------------------------------------------------------------------
/datachatbot/ingest.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from dotenv import load_dotenv
 4 | from langchain.document_loaders import DirectoryLoader
 5 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 6 | from langchain.embeddings import HuggingFaceEmbeddings
 7 | from langchain.vectorstores import FAISS
 8 | 
 9 | def load_and_split_documents(chunk_size, chunk_overlap):
10 |     print ("Loading documents...")
11 |     loader = DirectoryLoader(
12 |         path='./docs',
13 |         glob="**/*",
14 |         show_progress=False,
15 |         use_multithreading=True
16 |     )
17 |     print ("Documents loaded")
18 |     print ("Creating chunks...")
19 |     text_splitter = RecursiveCharacterTextSplitter(
20 |         chunk_size = chunk_size,
21 |         chunk_overlap = chunk_overlap,
22 |         length_function = len,
23 |     )
24 | 
25 |     chunks = loader.load_and_split(
26 |         text_splitter=text_splitter
27 |     )
28 |     print("Created", len(chunks), "chunks of data")
29 |     return chunks
30 | 
def main():
    """Build a FAISS vector store from the documents and save it to disk.

    Loads environment variables from ``.env``, splits the documents into
    chunks, embeds them with the ``all-MiniLM-L6-v2`` sentence-transformers
    model and writes the resulting index to the ``faiss_vector_db`` folder.

    Raises:
        SystemExit: If no chunks were produced, i.e. there is nothing to index.
    """
    load_dotenv()
    chunk_size = 1024
    chunk_overlap = 100

    chunks = load_and_split_documents(chunk_size, chunk_overlap)
    if not chunks:
        # Stop with a clear message instead of handing FAISS an empty list.
        raise SystemExit("No documents found in ./docs - nothing to index.")

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)
    vectorstore.save_local("faiss_vector_db")
    print("Success! Vector Store has been created!")


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/datachatbot/main.py:
--------------------------------------------------------------------------------
 1 | from dotenv import load_dotenv
 2 | from langchain.llms import HuggingFaceHub
 3 | from langchain.embeddings import HuggingFaceEmbeddings
 4 | from langchain.vectorstores import FAISS
 5 | from langchain.chains import RetrievalQA
 6 | 
 7 | if __name__ == '__main__':
 8 |     load_dotenv()
 9 |     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
10 |     vector_store = FAISS.load_local("faiss_vector_db", embeddings= embeddings)
11 |     repo_id = "tiiuae/falcon-7b-instruct"
12 |     llm = HuggingFaceHub(
13 |         repo_id=repo_id,
14 |         model_kwargs={
15 |             "temperature": 0.1,
16 |             "max_new_tokens":100,
17 |         },
18 |     )
19 | 
20 |     qa = RetrievalQA.from_chain_type(
21 |         llm=llm, 
22 |         chain_type="stuff",
23 |         retriever=vector_store.as_retriever(), 
24 |         return_source_documents = True
25 |     )
26 | 
27 |     exit_conditions = (":q", "quit", "exit")
28 |     while True:
29 |         query = input("\nType your question\n")
30 | 
31 |         if query in exit_conditions:
32 |             break
33 |         else:
34 |             result = qa(query)
35 |             print("-" * 60)
36 |             for index, doc in enumerate(result["source_documents"], start=1):
37 |                 print(f"Document {index}:")
38 |                 print("Page Content:")
39 |                 print(doc.page_content)
40 |                 print("Metadata:", doc.metadata)
41 |                 print("-" * 40)
42 | 
43 |             print("\n")
44 |             print("Result:", result["result"])


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos
 32 | #  into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | cover/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | db.sqlite3-journal
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | .pybuilder/
 77 | target/
 78 | 
 79 | # Jupyter Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # IPython
 83 | profile_default/
 84 | ipython_config.py
 85 | 
 86 | # pyenv
 87 | #   For a library or package, you might want to ignore these files since
 88 | #   the code is
 89 | #   intended to run in multiple environments; otherwise, check them in:
 90 | # .python-version
 91 | 
 92 | # pipenv
 93 | #   According to pypa/pipenv#598, it is recommended to include
 94 | #   Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific
 96 | #   dependencies or dependencies
 97 | #   having no cross-platform support, pipenv may install dependencies that
 98 | #   don't work, or not
 99 | #   install all needed dependencies.
100 | #Pipfile.lock
101 | 
102 | # poetry
103 | #   Similar to Pipfile.lock, it is generally recommended to include
104 | #   poetry.lock in version control.
105 | #   This is especially recommended for binary packages to ensure
106 | #   reproducibility, and is more
107 | #   commonly ignored for libraries.
108 | #
109 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
110 | #poetry.lock
111 | 
112 | # pdm
113 | #   Similar to Pipfile.lock, it is generally recommended to include
114 | #   pdm.lock in version control.
115 | #pdm.lock
116 | #   pdm stores project-wide configurations in .pdm.toml, but it is
117 | #   recommended to not include it
118 | #   in version control.
119 | #   https://pdm.fming.dev/#use-with-ide
120 | .pdm.toml
121 | 
122 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and
123 | # github.com/pdm-project/pdm
124 | __pypackages__/
125 | 
126 | # Celery stuff
127 | celerybeat-schedule
128 | celerybeat.pid
129 | 
130 | # SageMath parsed files
131 | *.sage.py
132 | 
133 | # Environments
134 | .env
135 | .venv
136 | env/
137 | venv/
138 | ENV/
139 | env.bak/
140 | venv.bak/
141 | 
142 | # Spyder project settings
143 | .spyderproject
144 | .spyproject
145 | 
146 | # Rope project settings
147 | .ropeproject
148 | 
149 | # mkdocs documentation
150 | /site
151 | 
152 | # mypy
153 | .mypy_cache/
154 | .dmypy.json
155 | dmypy.json
156 | 
157 | # Pyre type checker
158 | .pyre/
159 | 
160 | # pytype static type analyzer
161 | .pytype/
162 | 
163 | # Cython debug symbols
164 | cython_debug/
165 | 
166 | # PyCharm
167 | #  JetBrains specific template is maintained in a separate
168 | #  JetBrains.gitignore that can
169 | #  be found at
170 | #  https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
171 | #  and can be added to the global gitignore or merged into this file.  For
172 | #  a more nuclear
173 | #  option (not recommended) you can uncomment the following to ignore the
174 | #  entire idea folder.
175 | #.idea/
176 | 


--------------------------------------------------------------------------------