├── .github └── workflows │ ├── docker-image.yml │ ├── fly-deploy.yml │ └── fly-rollback.yml ├── .gitignore ├── .prefectignore ├── .run ├── create_secrets_blocks.run.xml ├── ingest_de.run.xml ├── ingest_ml.run.xml ├── ingest_mlops.run.xml ├── run_bot_datatalks.run.xml └── run_bot_local_ws.run.xml ├── Mlops_chatbot_diagram.png ├── README.md ├── dev.env ├── fly.toml ├── ingest ├── README.md ├── de.dockerfile ├── de │ └── ingest_de.py ├── dev.env ├── llm.dockerfile ├── llm │ └── ingest_llm.py ├── local_development.md ├── local_milvus │ └── docker-compose.yml ├── ml.dockerfile ├── ml │ └── ingest_ml.py ├── mlops.dockerfile ├── mlops │ ├── ingest_mlops.py │ └── ingest_mlops_old.py ├── prefect.md ├── prefect_infra │ └── create_secrets_blocks.py ├── readers │ ├── custom_faq_gdoc_reader.py │ ├── slack_reader.py │ └── youtube_reader.py ├── requirements.txt └── utils │ └── index_utils.py ├── prefect.yaml ├── requirements.txt ├── slack_bot ├── Dockerfile ├── README.md ├── app_manifest.json ├── bot_icon.png ├── dev.env ├── docker-compose-my-workspace.yml ├── docker-compose.yml ├── main.py └── requirements.txt ├── slack_bot_custom_ingestion.png └── test.py /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build-and-push-image: 8 | name: Push Docker image to Docker Hub 9 | runs-on: ubuntu-latest 10 | permissions: 11 | contents: read 12 | packages: write 13 | 14 | steps: 15 | - name: Check out the repo 16 | uses: actions/checkout@v4 17 | 18 | - name: Log in to Docker Hub 19 | uses: docker/login-action@v3 20 | with: 21 | username: ${{ secrets.DOCKER_USERNAME }} 22 | password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }} 23 | 24 | - name: Pull currently used Docker image 25 | run: docker pull aaalexlit/faq-slack-bot:main 26 | 27 | - name: Tag currently used Docker image as "previous" to enable easy rollback 28 | run: docker tag aaalexlit/faq-slack-bot:main aaalexlit/faq-slack-bot:previous 29 | 30 | - name: Push tagged image to Docker Hub 31 | run: docker push aaalexlit/faq-slack-bot:previous 32 | 33 | - name: Extract metadata (tags, labels) for Docker 34 | id: meta 35 | uses: docker/metadata-action@v5 36 | with: 37 | images: aaalexlit/faq-slack-bot 38 | tags: | 39 | type=sha 40 | type=ref,event=branch 41 | 42 | - name: Build and push Docker image 43 | uses: docker/build-push-action@v5 44 | with: 45 | context: ./slack_bot/ 46 | file: ./slack_bot/Dockerfile 47 | push: true 48 | tags: ${{ steps.meta.outputs.tags }} 49 | labels: ${{ steps.meta.outputs.labels }} 50 | -------------------------------------------------------------------------------- /.github/workflows/fly-deploy.yml: -------------------------------------------------------------------------------- 1 | name: Fly Deploy 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Docker Image CI"] 6 | types: 7 | - completed 8 | workflow_dispatch: 9 | inputs: 10 | docker_tag: 11 | description: 'Docker image tag to be deployed. 
by default `main`' 12 | required: false 13 | default: 'main' 14 | 15 | jobs: 16 | deploy: 17 | name: Deploy app 18 | runs-on: ubuntu-latest 19 | if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }} 20 | steps: 21 | - uses: actions/checkout@v4 22 | - uses: superfly/flyctl-actions/setup-flyctl@master 23 | - run: | 24 | if [ "${{ github.event_name }}" == "workflow_run" ]; then 25 | DOCKER_TAG="main" 26 | else 27 | DOCKER_TAG="${{ github.event.workflow_run.event.inputs.docker_tag || github.event.inputs.docker_tag }}" 28 | fi 29 | flyctl deploy --remote-only --image aaalexlit/faq-slack-bot:${DOCKER_TAG} 30 | env: 31 | FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} 32 | -------------------------------------------------------------------------------- /.github/workflows/fly-rollback.yml: -------------------------------------------------------------------------------- 1 | name: Fly Rollback 2 | on: 3 | workflow_dispatch: 4 | jobs: 5 | rollback: 6 | name: Rollback to the previously deployed image 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - uses: superfly/flyctl-actions/setup-flyctl@master 11 | - run: flyctl deploy --remote-only --image aaalexlit/faq-slack-bot:previous 12 | env: 13 | FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### PyCharm+all template 2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 4 | 5 | # User-specific stuff 6 | .idea/**/workspace.xml 7 | .idea/**/tasks.xml 8 | .idea/**/usage.statistics.xml 9 | .idea/**/dictionaries 10 | .idea/**/shelf 11 | 12 | # AWS User-specific 13 | .idea/**/aws.xml 14 | 15 | # Generated files 16 | .idea/**/contentModel.xml 17 | 18 | # Sensitive or high-churn files 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.local.xml 22 | .idea/**/sqlDataSources.xml 23 | .idea/**/dynamic.xml 24 | .idea/**/uiDesigner.xml 25 | .idea/**/dbnavigator.xml 26 | 27 | # Gradle 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # Gradle and Maven with auto-import 32 | # When using Gradle or Maven with auto-import, you should exclude module files, 33 | # since they will be recreated, and may cause churn. Uncomment if using 34 | # auto-import. 
35 | # .idea/artifacts 36 | # .idea/compiler.xml 37 | # .idea/jarRepositories.xml 38 | # .idea/modules.xml 39 | # .idea/*.iml 40 | # .idea/modules 41 | # *.iml 42 | # *.ipr 43 | 44 | # CMake 45 | cmake-build-*/ 46 | 47 | # Mongo Explorer plugin 48 | .idea/**/mongoSettings.xml 49 | 50 | # File-based project format 51 | *.iws 52 | 53 | # IntelliJ 54 | out/ 55 | 56 | # mpeltonen/sbt-idea plugin 57 | .idea_modules/ 58 | 59 | # JIRA plugin 60 | atlassian-ide-plugin.xml 61 | 62 | # Cursive Clojure plugin 63 | .idea/replstate.xml 64 | 65 | # SonarLint plugin 66 | .idea/sonarlint/ 67 | 68 | # Crashlytics plugin (for Android Studio and IntelliJ) 69 | com_crashlytics_export_strings.xml 70 | crashlytics.properties 71 | crashlytics-build.properties 72 | fabric.properties 73 | 74 | # Editor-based Rest Client 75 | .idea/httpRequests 76 | 77 | # Android studio 3.1+ serialized cache file 78 | .idea/caches/build_file_checksums.ser 79 | 80 | ### Python template 81 | # Byte-compiled / optimized / DLL files 82 | __pycache__/ 83 | *.py[cod] 84 | *$py.class 85 | 86 | # C extensions 87 | *.so 88 | 89 | # Distribution / packaging 90 | .Python 91 | build/ 92 | develop-eggs/ 93 | dist/ 94 | downloads/ 95 | eggs/ 96 | .eggs/ 97 | lib/ 98 | lib64/ 99 | parts/ 100 | sdist/ 101 | var/ 102 | wheels/ 103 | share/python-wheels/ 104 | *.egg-info/ 105 | .installed.cfg 106 | *.egg 107 | MANIFEST 108 | 109 | # PyInstaller 110 | # Usually these files are written by a python script from a template 111 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 112 | *.manifest 113 | *.spec 114 | 115 | # Installer logs 116 | pip-log.txt 117 | pip-delete-this-directory.txt 118 | 119 | # Unit test / coverage reports 120 | htmlcov/ 121 | .tox/ 122 | .nox/ 123 | .coverage 124 | .coverage.* 125 | .cache 126 | nosetests.xml 127 | coverage.xml 128 | *.cover 129 | *.py,cover 130 | .hypothesis/ 131 | .pytest_cache/ 132 | cover/ 133 | 134 | # Translations 135 | *.mo 136 | *.pot 137 | 138 | # Django stuff: 139 | *.log 140 | local_settings.py 141 | db.sqlite3 142 | db.sqlite3-journal 143 | 144 | # Flask stuff: 145 | instance/ 146 | .webassets-cache 147 | 148 | # Scrapy stuff: 149 | .scrapy 150 | 151 | # Sphinx documentation 152 | docs/_build/ 153 | 154 | # PyBuilder 155 | .pybuilder/ 156 | target/ 157 | 158 | # Jupyter Notebook 159 | .ipynb_checkpoints 160 | 161 | # IPython 162 | profile_default/ 163 | ipython_config.py 164 | 165 | # pyenv 166 | # For a library or package, you might want to ignore these files since the code is 167 | # intended to run in multiple environments; otherwise, check them in: 168 | # .python-version 169 | 170 | # pipenv 171 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 172 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 173 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 174 | # install all needed dependencies. 175 | #Pipfile.lock 176 | 177 | # poetry 178 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 179 | # This is especially recommended for binary packages to ensure reproducibility, and is more 180 | # commonly ignored for libraries. 181 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 182 | #poetry.lock 183 | 184 | # pdm 185 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
186 | #pdm.lock 187 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 188 | # in version control. 189 | # https://pdm.fming.dev/#use-with-ide 190 | .pdm.toml 191 | 192 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 193 | __pypackages__/ 194 | 195 | # Celery stuff 196 | celerybeat-schedule 197 | celerybeat.pid 198 | 199 | # SageMath parsed files 200 | *.sage.py 201 | 202 | # Environments 203 | .env 204 | .venv 205 | env/ 206 | venv/ 207 | ENV/ 208 | env.bak/ 209 | venv.bak/ 210 | 211 | # Spyder project settings 212 | .spyderproject 213 | .spyproject 214 | 215 | # Rope project settings 216 | .ropeproject 217 | 218 | # mkdocs documentation 219 | /site 220 | 221 | # mypy 222 | .mypy_cache/ 223 | .dmypy.json 224 | dmypy.json 225 | 226 | # Pyre type checker 227 | .pyre/ 228 | 229 | # pytype static type analyzer 230 | .pytype/ 231 | 232 | # Cython debug symbols 233 | cython_debug/ 234 | 235 | # PyCharm 236 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 237 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 238 | # and can be added to the global gitignore or merged into this file. For a more nuclear 239 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 240 | #.idea/ 241 | 242 | ingest/keys 243 | wandb/ 244 | .idea 245 | 246 | # ignore local folders where the indexed repo gets cloned to 247 | **/git 248 | 249 | # ignore local milvus volumes 250 | **/volumes 251 | 252 | /ingest/ml/build_docker_image.md 253 | 254 | *.ipynb -------------------------------------------------------------------------------- /.prefectignore: -------------------------------------------------------------------------------- 1 | # prefect artifacts 2 | .prefectignore 3 | 4 | # python artifacts 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | *.egg-info/ 9 | *.egg 10 | 11 | # Type checking artifacts 12 | .mypy_cache/ 13 | .dmypy.json 14 | dmypy.json 15 | .pyre/ 16 | 17 | # IPython 18 | profile_default/ 19 | ipython_config.py 20 | *.ipynb_checkpoints/* 21 | 22 | # Environments 23 | .python-version 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | 29 | # MacOS 30 | .DS_Store 31 | 32 | # Dask 33 | dask-worker-space/ 34 | 35 | # Editors 36 | .idea/ 37 | .vscode/ 38 | 39 | # VCS 40 | .git/ 41 | .hg/ 42 | -------------------------------------------------------------------------------- /.run/create_secrets_blocks.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 25 | -------------------------------------------------------------------------------- /.run/ingest_de.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 27 | -------------------------------------------------------------------------------- /.run/ingest_ml.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 27 | -------------------------------------------------------------------------------- /.run/ingest_mlops.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 27 | -------------------------------------------------------------------------------- /.run/run_bot_datatalks.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 26 | 
-------------------------------------------------------------------------------- /.run/run_bot_local_ws.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 26 | -------------------------------------------------------------------------------- /Mlops_chatbot_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaalexlit/faq-slack-bot/78962d17dbd68438bd443b7dfdb961ac9d13e574/Mlops_chatbot_diagram.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is a repo for a human-interface LLM-powered QA Slack chatbot for answering questions 2 | related to DataTalks.Club Zoomcamps 3 | 4 | # Current version 5 | 6 | Please follow [this report](https://api.wandb.ai/links/aaalex-lit/ii6tpid4) that 7 | explains in detail how the bot works 8 | 9 | # First (outdated) version 10 | ## Behind the scenes 11 | Course FAQ Google Document and the Course repo get indexed to the Pinecone vector store. 12 | Then semantic search retrieves the most similar (and hopefully most relevant) pieces to the question asked. 13 | Then this information is passed as a context to a conversational LLM to form the final answer. 14 | 15 | ![Diagram](Mlops_chatbot_diagram.png) 16 | 17 | # Before you start 18 | 19 | Use any python virtual environment manager of your preference 20 | and install the dependencies from [requirements.txt](requirements.txt) 21 | 22 | # Fill [Pinecone](https://www.pinecone.io/) index 23 | 1. Set `PINECONE_API_KEY` and `PINECONE_ENV` 24 | environmental variables accordingly 25 | 26 | 2. run [ingest/ingest.py](ingest/mlops/ingest_mlops_old.py) 27 | 28 | ```bash 29 | python ingest_mlops_old.py 30 | ``` 31 | # Test QA pipeline locally 32 | 1. Set `OPENAI_API_KEY`, `PINECONE_API_KEY`, and `PINECONE_ENV` 33 | environmental variables accordingly 34 | 1. Run [test.py](test.py) 35 | 36 | ```bash 37 | python test.py 38 | ``` 39 | # Launch the bot 40 | 1. Set `OPENAI_API_KEY`, `SLACK_APP_TOKEN`, `SLACK_BOT_TOKEN`, 41 | `PINECONE_API_KEY`, and `PINECONE_ENV` 42 | environmental variables accordingly 43 | 1. To launch the bot on the CLI run [slack_bot/main.py](slack_bot/main.py) 44 | ```bash 45 | python main.py 46 | ``` 47 | 48 | Alternatively it can be launched with Docker, please follow 49 | [this README](slack_bot/README.md) -------------------------------------------------------------------------------- /dev.env: -------------------------------------------------------------------------------- 1 | # test workspace slack token 2 | SLACK_APP_TOKEN=xapp-.. 3 | SLACK_BOT_TOKEN=xoxb-.. 4 | 5 | # OpenAI API key 6 | OPENAI_API_KEY=sk-.. 7 | 8 | PINECONE_API_KEY=.. 9 | PINECONE_ENV=.. 10 | 11 | WANDB_API_KEY=.. 12 | 13 | ZILLIZ_CLOUD_URI=https://.. 14 | ZILLIZ_CLOUD_API_KEY=.. 15 | 16 | ZILLIZ_PUBLIC_ENDPOINT=https://.. 17 | ZILLIZ_API_KEY=.. 18 | 19 | LANGCHAIN_API_KEY=lsv2_.. 20 | 21 | COHERE_API_KEY=.. 
22 | 23 | # DEBUG log level 24 | #LOG_LEVEL=10 25 | 26 | LOCAL_MILVUS=True -------------------------------------------------------------------------------- /fly.toml: -------------------------------------------------------------------------------- 1 | app = "faq-slack-bot" 2 | primary_region = "mad" 3 | 4 | [build] 5 | image = "aaalexlit/faq-slack-bot:main" 6 | 7 | [env] 8 | PINECONE_ENV = "gcp-starter" 9 | -------------------------------------------------------------------------------- /ingest/README.md: -------------------------------------------------------------------------------- 1 | # Execute indexing 2 | ## For ML Zoomcamp 3 | At the moment the indexing is scheduled to execute with [Prefect Cloud](https://app.prefect.cloud/) 4 | via deployments every 24 hours at 23 CET 5 | 6 | Steps to change/run the deployment are described in [prefect.md](prefect.md) 7 | 8 | ## For MLOps Zoomcamp 9 | 10 | Execute [ingest.py](mlops/ingest_mlops_old.py) 11 | ```shell 12 | python ingest_mlops_old.py 13 | ``` 14 | 15 | # Setup Prefect 16 | 17 | To run any ingestion, Prefect needs to be set up, 18 | as the code relies on secrets stored in Prefect blocks. 19 | 20 | ## Create a new profile to use with the cloud and use it (Optional) 21 | 22 | ```bash 23 | prefect profile create cloud 24 | prefect profile use cloud 25 | ``` 26 | 27 | ## Log in to prefect cloud either though browser or using the API key 28 | ```bash 29 | prefect cloud login 30 | ``` 31 | 32 | Create the required prefect blocks. Make sure to set up corresponding environment 33 | variables. 34 | 35 | ```shell 36 | python ingest/prefect_infra/create_secrets_blocks.py 37 | ``` -------------------------------------------------------------------------------- /ingest/de.dockerfile: -------------------------------------------------------------------------------- 1 | FROM prefecthq/prefect:2-python3.10 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | RUN pip install -U pip 7 | 8 | WORKDIR /usr/src 9 | 10 | COPY ingest/requirements.txt ./ 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | ENV EMBEDDING_CACHE_NAMESPACE=de_zoomcamp 14 | 15 | COPY ingest/de/ingest_de.py ingest/de/ 16 | COPY ingest/readers ingest/readers 17 | COPY ingest/utils ingest/utils 18 | -------------------------------------------------------------------------------- /ingest/de/ingest_de.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from prefect import flow, task 4 | 5 | from ingest.utils.index_utils import index_spreadsheet, index_github_repo, \ 6 | index_slack_history, index_faq, index_youtube 7 | 8 | DE_CHANNEL_ID = 'C01FABYF2RG' 9 | FAQ_COLLECTION_NAME = 'dezoomcamp_faq_git' 10 | 11 | os.environ['PREFECT_LOGGING_EXTRA_LOGGERS'] = 'llama-index-core' 12 | 13 | 14 | @task(name="Index course github repo") 15 | def index_course_github_repo(): 16 | owner = 'DataTalksClub' 17 | repo = 'data-engineering-zoomcamp' 18 | branch = 'main' 19 | index_github_repo(owner=owner, 20 | repo=repo, 21 | branch=branch, 22 | collection_name=FAQ_COLLECTION_NAME, 23 | ignore_directories=['.github', '.gitignore', 'cohorts/2022', 'cohorts/2023', 'cohorts/2024', 24 | 'images'], 25 | ) 26 | 27 | 28 | @task(name="Index risingwave zoomcamp github repo") 29 | def index_risingwave_zoomcamp_github_repo(): 30 | owner = 'risingwavelabs' 31 | repo = 'risingwave-data-talks-workshop-2024-03-04' 32 | branch = 'main' 33 | index_github_repo(owner=owner, 34 | repo=repo, 35 | branch=branch, 36 | 
collection_name=FAQ_COLLECTION_NAME, 37 | ignore_directories=['assets', 'data'], 38 | ignore_file_extensions=['.gitignore', '.parquet', '.csv']) 39 | 40 | 41 | @task(name="Index mage zoomcamp github repo") 42 | def index_mage_zoomcamp_github_repo(): 43 | owner = 'mage-ai' 44 | repo = 'mage-zoomcamp' 45 | branch = 'solutions' 46 | index_github_repo(owner=owner, 47 | repo=repo, 48 | branch=branch, 49 | collection_name=FAQ_COLLECTION_NAME, 50 | ignore_directories=[], 51 | ignore_file_extensions=['.gitignore']) 52 | 53 | 54 | @task(name="Index FAQ Google Document") 55 | def index_google_doc(): 56 | document_ids = ["19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw"] 57 | print('Loading google doc...') 58 | index_faq(document_ids, FAQ_COLLECTION_NAME) 59 | 60 | 61 | @task(name="Index course schedule") 62 | def index_course_schedule(): 63 | url = ( 64 | 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-' 65 | 'yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml') 66 | title = 'DE Zoomcamp 2024 syllabus and deadlines' 67 | index_spreadsheet(url, title, FAQ_COLLECTION_NAME) 68 | 69 | 70 | @task(name="Index slack messages") 71 | def index_slack_messages(): 72 | channel_ids = [DE_CHANNEL_ID] 73 | index_slack_history(channel_ids, FAQ_COLLECTION_NAME) 74 | 75 | 76 | @task(name="Index QA videos subtitles") 77 | def index_yt_subtitles(): 78 | video_ids = ['X8cEEwi8DTM'] 79 | index_youtube(video_ids, FAQ_COLLECTION_NAME) 80 | 81 | 82 | @flow(name="Update DE info Milvus index", log_prints=True) 83 | def fill_de_index(): 84 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}") 85 | index_google_doc() 86 | index_slack_messages.submit(wait_for=[index_google_doc]) 87 | index_course_schedule.submit(wait_for=[index_google_doc]) 88 | # index_evaluation_criteria.submit(wait_for=[index_google_doc]) 89 | index_course_github_repo.submit(wait_for=[index_google_doc]) 90 | index_yt_subtitles.submit(wait_for=[index_google_doc]) 91 | 92 | 93 | if __name__ == '__main__': 94 | fill_de_index() 95 | -------------------------------------------------------------------------------- /ingest/dev.env: -------------------------------------------------------------------------------- 1 | PINECONE_API_KEY=.. 2 | PINECONE_ENV=.. 3 | ZILLIZ_CLOUD_URI=https://.. 4 | ZILLIZ_CLOUD_API_KEY=.. 5 | SLACK_BOT_TOKEN=xoxb-.. 6 | GITHUB_TOKEN=ghp_.. 7 | UPSTASH_REDIS_REST_URL=https://.. 8 | UPSTASH_REDIS_REST_TOKEN=.. 9 | ZILLIZ_PUBLIC_ENDPOINT=https://.. 10 | ZILLIZ_API_KEY=.. 
11 | -------------------------------------------------------------------------------- /ingest/llm.dockerfile: -------------------------------------------------------------------------------- 1 | FROM prefecthq/prefect:2-python3.10 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | RUN pip install -U pip 7 | 8 | WORKDIR /usr/src 9 | 10 | COPY ingest/requirements.txt ./ 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | ENV EMBEDDING_CACHE_NAMESPACE=llm_zoomcamp 14 | 15 | COPY ingest/llm/ingest_llm.py ingest/llm/ 16 | COPY ingest/readers ingest/readers 17 | COPY ingest/utils ingest/utils 18 | -------------------------------------------------------------------------------- /ingest/llm/ingest_llm.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from prefect import flow, task 4 | 5 | from ingest.utils.index_utils import index_github_repo, \ 6 | index_slack_history, index_faq 7 | 8 | SLACK_CHANNEL_ID = 'C06TEGTGM3J' 9 | COLLECTION_NAME = 'llmzoomcamp' 10 | 11 | 12 | @task(name="Index course github repo") 13 | def index_course_github_repo(): 14 | owner = 'DataTalksClub' 15 | repo = 'llm-zoomcamp' 16 | branch = 'main' 17 | index_github_repo(owner=owner, 18 | repo=repo, 19 | branch=branch, 20 | collection_name=COLLECTION_NAME, 21 | ignore_directories=['.github', '.gitignore', 'images'], 22 | ) 23 | 24 | 25 | @task(name="Index FAQ Google Document") 26 | def index_google_doc(): 27 | document_ids = ["1m2KexowAXTmexfC5rVTCSnaShvdUQ8Ag2IEiwBDHxN0"] 28 | print('Loading google doc...') 29 | index_faq(document_ids, COLLECTION_NAME) 30 | 31 | 32 | @task(name="Index slack messages") 33 | def index_slack_messages(): 34 | channel_ids = [SLACK_CHANNEL_ID] 35 | index_slack_history(channel_ids, COLLECTION_NAME) 36 | 37 | 38 | @flow(name="Update LLM info Milvus index", log_prints=True) 39 | def fill_llm_index(): 40 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}") 41 | index_google_doc() 42 | index_slack_messages.submit(wait_for=[index_google_doc]) 43 | index_course_github_repo.submit(wait_for=[index_google_doc]) 44 | 45 | 46 | if __name__ == '__main__': 47 | fill_llm_index() 48 | -------------------------------------------------------------------------------- /ingest/local_development.md: -------------------------------------------------------------------------------- 1 | # Run ingestion locally for ML and DE Zoomcamps 2 | 3 | Steps to fill in the index locally: 4 | 5 | 1. start dockerized [Milvus](https://milvus.io/) from [local_milvus](local_milvus) folder 6 | ```shell 7 | cd ingest/local_milvus 8 | docker compose up 9 | ``` 10 | 11 | 1. Rename [dev.env](../dev.env) to `.env` and set all the required variables 12 | 13 | 1. Create the prefect blocks (needs to be run once) 14 | ```shell 15 | python ingest/prefect_infra/create_secrets_blocks.py 16 | ``` 17 | 18 | 1. execute ingestion script [ingest_ml.py](ml/ingest_ml.py) (for ML zoomcamp data) 19 | or [ingest_de.py](de/ingest_de.py) (for DE zoomcamp data). 
20 | It will be executed with `EXECUTION_ENV` env var set to `local` by default 21 | ```shell 22 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 23 | python ingest/ml/ingest_ml.py 24 | ``` 25 | 26 | If you're using Pycharm IDE there are run configurations available: 27 | [ingest_de](../.run/ingest_de.run.xml) 28 | [ingest_ml](../.run/ingest_ml.run.xml) -------------------------------------------------------------------------------- /ingest/local_milvus/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.5' 2 | 3 | services: 4 | etcd: 5 | container_name: milvus-etcd 6 | image: quay.io/coreos/etcd:v3.5.16 7 | environment: 8 | - ETCD_AUTO_COMPACTION_MODE=revision 9 | - ETCD_AUTO_COMPACTION_RETENTION=1000 10 | - ETCD_QUOTA_BACKEND_BYTES=4294967296 11 | - ETCD_SNAPSHOT_COUNT=50000 12 | volumes: 13 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd 14 | command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd 15 | healthcheck: 16 | test: ["CMD", "etcdctl", "endpoint", "health"] 17 | interval: 30s 18 | timeout: 20s 19 | retries: 3 20 | 21 | minio: 22 | container_name: milvus-minio 23 | image: minio/minio:RELEASE.2023-03-20T20-16-18Z 24 | environment: 25 | MINIO_ACCESS_KEY: minioadmin 26 | MINIO_SECRET_KEY: minioadmin 27 | ports: 28 | - "9001:9001" 29 | - "9000:9000" 30 | volumes: 31 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data 32 | command: minio server /minio_data --console-address ":9001" 33 | healthcheck: 34 | test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] 35 | interval: 30s 36 | timeout: 20s 37 | retries: 3 38 | 39 | standalone: 40 | container_name: milvus-standalone 41 | image: milvusdb/milvus:v2.5.4 42 | command: ["milvus", "run", "standalone"] 43 | security_opt: 44 | - seccomp:unconfined 45 | environment: 46 | ETCD_ENDPOINTS: etcd:2379 47 | MINIO_ADDRESS: minio:9000 48 | volumes: 49 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus 50 | healthcheck: 51 | test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] 52 | interval: 30s 53 | start_period: 90s 54 | timeout: 20s 55 | retries: 3 56 | ports: 57 | - "19530:19530" 58 | - "9091:9091" 59 | depends_on: 60 | - "etcd" 61 | - "minio" 62 | 63 | networks: 64 | default: 65 | name: milvus -------------------------------------------------------------------------------- /ingest/ml.dockerfile: -------------------------------------------------------------------------------- 1 | FROM prefecthq/prefect:2-python3.10 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | RUN pip install -U pip 7 | 8 | WORKDIR /usr/src 9 | 10 | COPY ingest/requirements.txt ./ 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | ENV EMBEDDING_CACHE_NAMESPACE=ml_zoomcamp 14 | 15 | COPY ingest/ml/ingest_ml.py ingest/ml/ 16 | COPY ingest/readers ingest/readers 17 | COPY ingest/utils ingest/utils 18 | -------------------------------------------------------------------------------- /ingest/ml/ingest_ml.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from prefect import flow, task 4 | 5 | from ingest.utils.index_utils import index_spreadsheet, index_github_repo, \ 6 | index_slack_history, index_faq 7 | 8 | ML_CHANNEL_ID = 'C0288NJ5XSA' 9 | FAQ_COLLECTION_NAME = 'mlzoomcamp_faq_git' 10 | 11 | 12 | @task(name="Index course github repo") 13 | def index_course_github_repo(): 14 | owner = 'DataTalksClub' 15 | repo 
= 'machine-learning-zoomcamp' 16 | branch = 'master' 17 | index_github_repo(owner=owner, 18 | repo=repo, 19 | branch=branch, 20 | collection_name=FAQ_COLLECTION_NAME) 21 | 22 | 23 | @task(name="Index book github repo") 24 | def index_book_github_repo(): 25 | owner = 'alexeygrigorev' 26 | repo = 'mlbookcamp-code' 27 | branch = 'master' 28 | ignore_directories = ['.github', 'course-zoomcamp', 'images', 'util'] 29 | index_github_repo(owner=owner, 30 | repo=repo, 31 | branch=branch, 32 | ignore_directories=ignore_directories, 33 | collection_name=FAQ_COLLECTION_NAME) 34 | 35 | 36 | @task(name="Index FAQ Google Document") 37 | def index_google_doc(): 38 | document_ids = ["1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8"] 39 | print('Loading google doc...') 40 | index_faq(document_ids, FAQ_COLLECTION_NAME) 41 | 42 | 43 | @task(name="Index course schedule") 44 | def index_course_schedule(): 45 | url = ('https://docs.google.com/spreadsheets/d/e/2PACX' 46 | '-1vSkEwMv5OKwCdPfW6LgqQvKk48dZjPcFDrjDstBqZfq38UPadh0Nws1b57qOVYwzAjSufKnVf7umGWH/pubhtml') 47 | title = 'ML Zoomcamp 2023 syllabus and deadlines' 48 | index_spreadsheet(url, title, FAQ_COLLECTION_NAME) 49 | 50 | 51 | @task(name="Index project evaluation criteria") 52 | def index_evaluation_criteria(): 53 | url = ('https://docs.google.com/spreadsheets/d/e/2PACX' 54 | '-1vQCwqAtkjl07MTW-SxWUK9GUvMQ3Pv_fF8UadcuIYLgHa0PlNu9BRWtfLgivI8xSCncQs82HDwGXSm3/pubhtml') 55 | title = 'ML Zoomcamp project evaluation criteria : Project criteria' 56 | index_spreadsheet(url, title, FAQ_COLLECTION_NAME) 57 | 58 | 59 | @task(name="Index slack messages") 60 | def index_slack_messages(): 61 | channel_ids = [ML_CHANNEL_ID] 62 | index_slack_history(channel_ids, FAQ_COLLECTION_NAME) 63 | 64 | 65 | @flow(name="Update ML info Milvus index", log_prints=True) 66 | def fill_ml_index(): 67 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}") 68 | index_google_doc() 69 | index_slack_messages.submit(wait_for=[index_google_doc]) 70 | index_course_schedule.submit(wait_for=[index_google_doc]) 71 | index_evaluation_criteria.submit(wait_for=[index_google_doc]) 72 | index_course_github_repo.submit(wait_for=[index_google_doc]) 73 | index_book_github_repo.submit(wait_for=[index_google_doc]) 74 | 75 | 76 | if __name__ == '__main__': 77 | fill_ml_index() 78 | -------------------------------------------------------------------------------- /ingest/mlops.dockerfile: -------------------------------------------------------------------------------- 1 | FROM prefecthq/prefect:2-python3.10 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | RUN pip install -U pip 7 | 8 | WORKDIR /usr/src 9 | 10 | COPY ingest/requirements.txt ./ 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | ENV EMBEDDING_CACHE_NAMESPACE=mlops_zoomcamp 14 | 15 | COPY ingest/mlops/ingest_mlops.py ingest/mlops/ 16 | COPY ingest/readers ingest/readers 17 | COPY ingest/utils ingest/utils 18 | -------------------------------------------------------------------------------- /ingest/mlops/ingest_mlops.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from prefect import flow, task 4 | 5 | from ingest.utils.index_utils import index_github_repo, \ 6 | index_slack_history, index_faq 7 | 8 | SLACK_CHANNEL_ID = 'C02R98X7DS9' 9 | COLLECTION_NAME = 'mlopszoomcamp' 10 | 11 | 12 | @task(name="Index course github repo") 13 | def index_course_github_repo(): 14 | owner = 'DataTalksClub' 15 | repo = 'mlops-zoomcamp' 16 | 
branch = 'main' 17 | index_github_repo(owner=owner, 18 | repo=repo, 19 | branch=branch, 20 | collection_name=COLLECTION_NAME, 21 | ignore_directories=['.github', '.gitignore', 'cohorts/2022', 'cohorts/2023', 'images'], 22 | ) 23 | 24 | 25 | @task(name="Index FAQ Google Document") 26 | def index_google_doc(): 27 | document_ids = ["12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0"] 28 | print('Loading google doc...') 29 | index_faq(document_ids, COLLECTION_NAME) 30 | 31 | 32 | @task(name="Index slack messages") 33 | def index_slack_messages(): 34 | channel_ids = [SLACK_CHANNEL_ID] 35 | index_slack_history(channel_ids, COLLECTION_NAME) 36 | 37 | 38 | @flow(name="Update MLOps info Milvus index", log_prints=True) 39 | def fill_mlops_index(): 40 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}") 41 | index_google_doc() 42 | index_slack_messages.submit(wait_for=[index_google_doc]) 43 | index_course_github_repo.submit(wait_for=[index_google_doc]) 44 | 45 | 46 | if __name__ == '__main__': 47 | fill_mlops_index() 48 | -------------------------------------------------------------------------------- /ingest/mlops/ingest_mlops_old.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import tempfile 5 | import time 6 | from pathlib import Path 7 | 8 | import pinecone # type: ignore 9 | from langchain_community.document_loaders import GoogleDriveLoader, GitLoader 10 | from langchain.embeddings import HuggingFaceEmbeddings 11 | from langchain.text_splitter import RecursiveCharacterTextSplitter 12 | from langchain.vectorstores import Pinecone 13 | from prefect import flow, task 14 | from prefect.blocks.system import Secret 15 | from prefect_gcp import GcpCredentials 16 | 17 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 18 | embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5') 19 | embedding_dimension = len(embeddings.embed_query("test")) 20 | print(f'embedding dimension = {embedding_dimension}') 21 | 22 | 23 | @task(name="Index FAQ Google Document") 24 | def ingest_google_doc(index_name: str, 25 | document_ids: list[str], 26 | ): 27 | print('Loading google doc...') 28 | temp_creds = tempfile.NamedTemporaryFile() 29 | creds_dict = GcpCredentials.load("google-drive-creds").service_account_info.get_secret_value() 30 | with open(temp_creds.name, 'w') as f_out: 31 | json.dump(creds_dict, f_out) 32 | loader = GoogleDriveLoader(service_account_key=Path(temp_creds.name), 33 | document_ids=document_ids) 34 | # loader = GoogleDriveLoader(service_account_key=Path.cwd() / "keys" / "service_account_key.json", 35 | # document_ids=document_ids) 36 | 37 | raw_docs = loader.load() 38 | temp_creds.close() 39 | print('Splitting docs for indexing...') 40 | text_splitter = get_text_splitter() 41 | docs = text_splitter.split_documents(raw_docs) 42 | 43 | index_docs(docs, index_name) 44 | 45 | 46 | def index_docs(docs, index_name): 47 | print('Filling the index up...') 48 | Pinecone.from_documents(docs, embeddings, index_name=index_name) 49 | time.sleep(10) 50 | print_index_status(index_name) 51 | 52 | 53 | @task(name="Delete and Create Pinecone index") 54 | def create_pinecone_index(index_name: str): 55 | if index_name in pinecone.list_indexes(): 56 | print(f"Index {index_name} exists. 
Deleting...") 57 | pinecone.delete_index(index_name) 58 | 59 | if index_name not in pinecone.list_indexes(): 60 | print(f"Creating index {index_name}...") 61 | pinecone.create_index( 62 | name=index_name, 63 | dimension=embedding_dimension 64 | ) 65 | 66 | print_index_status(index_name) 67 | 68 | 69 | def print_index_status(index_name): 70 | index = pinecone.GRPCIndex(index_name) 71 | index_stats = index.describe_index_stats() 72 | print(f"index stats: {index_stats}") 73 | 74 | 75 | @task(name="Index git repo") 76 | def ingest_git_repo(repo_url: str, index_name: str): 77 | local_dir_path = f"./git/{repo_url[repo_url.rindex('/') + 1:]}" 78 | if Path(local_dir_path).exists(): 79 | remove_local_dir(local_dir_path) 80 | loader = GitLoader( 81 | clone_url=repo_url, 82 | repo_path=local_dir_path, 83 | ) 84 | print('Loading and Splitting git repo for indexing...') 85 | text_splitter = get_text_splitter() 86 | docs = loader.load_and_split(text_splitter) 87 | index_docs(docs, index_name) 88 | remove_local_dir(local_dir_path) 89 | 90 | 91 | def remove_local_dir(local_dir_path): 92 | print(f'Removing local files in {local_dir_path}') 93 | shutil.rmtree(local_dir_path) 94 | 95 | 96 | def get_text_splitter(): 97 | return RecursiveCharacterTextSplitter( 98 | chunk_size=1000, 99 | chunk_overlap=200, 100 | ) 101 | 102 | 103 | @flow(name="Update the index in Pinecone for MLOps Zoomcamp", log_prints=True) 104 | def create_and_fill_the_index(index_name: str, 105 | google_doc_ids: list[str], 106 | repo_url: str, 107 | overwrite: bool): 108 | pinecone.init( 109 | api_key=Secret.load('pinecone-api-key').get(), 110 | environment=Secret.load('pinecone-env').get() 111 | ) 112 | if overwrite: 113 | create_pinecone_index(index_name=index_name) 114 | ingest_google_doc(index_name, 115 | google_doc_ids) 116 | ingest_git_repo(repo_url, index_name) 117 | 118 | 119 | if __name__ == "__main__": 120 | index_name = 'mlops-faq-bot' 121 | google_doc_id = ["12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0"] 122 | repo_url = 'https://github.com/DataTalksClub/mlops-zoomcamp' 123 | overwrite = True 124 | create_and_fill_the_index(index_name=index_name, 125 | google_doc_ids=google_doc_id, 126 | repo_url=repo_url, 127 | overwrite=overwrite) 128 | -------------------------------------------------------------------------------- /ingest/prefect.md: -------------------------------------------------------------------------------- 1 | # Run the ingestion for ML with prefect deployments 2 | 3 | ## Execute ingestion 4 | 5 | Currently, indexing is scheduled to execute: 6 | - Daily at 00:00 CET for **DE Zoomcamp** documents 7 | - Weekly at 23:00 CET on Monday for **ML Zoomcamp** documents 8 | 9 | Before running any execution make sure the worker is started: 10 | ```shell 11 | prefect worker start --pool zoomcamp-faq-bot 12 | ``` 13 | 14 | Ad-hoc executions can be run from the [Prefect Cloud UI](https://app.prefect.cloud/) 15 | by launching the corresponding deployment. 
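If you are unsure of the exact deployment name, the deployments registered in the current workspace can be listed from the CLI first (a quick check, assuming the Prefect CLI is logged in to the same Prefect Cloud workspace):

```shell
prefect deployment ls
```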
16 | 17 | 18 | It's also possible to run it from the command line: 19 | 20 | ### Run ingestion deployment for ML 21 | ```shell 22 | prefect deployment run 'Update ML info Milvus index/fill-index-zilliz-ml' 23 | ``` 24 | 25 | ### Run ingestion deployment for DE 26 | ```shell 27 | prefect deployment run 'Update DE info Milvus index/fill-index-zilliz-de' 28 | ``` 29 | 30 | ## Change the properties of a deployment 31 | ### Bulk 32 | Depending on the nature of the changes, after modifying the code or 33 | [prefect.yaml](../prefect.yaml) re-create both deployments by running 34 | 35 | ```shell 36 | prefect deploy --all 37 | ``` 38 | ### Individual 39 | Alternatively it can be done per deployment if the changes are not affecting both 40 | **re-create deployment for ML ingestion** 41 | ```shell 42 | prefect deploy --name fill-index-zilliz-ml 43 | ``` 44 | **re-create deployment for DE ingestion** 45 | ```shell 46 | prefect deploy --name fill-index-zilliz-de 47 | ``` 48 | 49 | ## Setup prefect from scratch 50 | 51 | Login to prefect cloud: 52 | 53 | ```shell 54 | prefect cloud login 55 | ``` 56 | 57 | Create the required blocks: 58 | 59 | ```shell 60 | python ingest/prefect_infra/create_secrets_blocks.py 61 | ``` 62 | 63 | Create work pool 64 | 65 | ```shell 66 | prefect work-pool create --type docker zoomcamp-faq-bot 67 | ``` 68 | 69 | Run the following command in this new terminal to start the worker: 70 | 71 | ```shell 72 | prefect worker start --pool zoomcamp-faq-bot 73 | ``` 74 | 75 | Create all the deployments from [prefect.yaml](../prefect.yaml) file 76 | 77 | ```shell 78 | prefect deploy --all 79 | ``` 80 | 81 | Run the ingestion by executing created deployments following the 82 | instructions above. -------------------------------------------------------------------------------- /ingest/prefect_infra/create_secrets_blocks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | 5 | from prefect.blocks.system import Secret 6 | from prefect_gcp import GcpCredentials 7 | 8 | 9 | def create_gcp_creds_block(): 10 | block_name = "google-drive-creds" 11 | try: 12 | GcpCredentials.load(block_name) 13 | print(f"Block {block_name} exists") 14 | except ValueError: 15 | print(f"Creating Block {block_name}") 16 | with open("../keys/service_account_key.json", 'r') as f_in: 17 | service_account_info_str = f_in.read() 18 | 19 | service_account_info = json.loads(service_account_info_str) 20 | 21 | GcpCredentials( 22 | service_account_info=service_account_info 23 | ).save(block_name) 24 | time.sleep(10) 25 | 26 | 27 | def create_secret_block(block_name: str, env_var_name: str) -> None: 28 | try: 29 | Secret.load(block_name) 30 | print(f"Block {block_name} exists") 31 | except ValueError: 32 | print(f"Creating Block {block_name}") 33 | Secret(value=os.getenv(env_var_name)).save(name=block_name) 34 | time.sleep(10) 35 | 36 | 37 | def create_pinecone_secrets(): 38 | create_secret_block('pinecone-api-key', 'PINECONE_API_KEY') 39 | create_secret_block('pinecone-env', 'PINECONE_ENV') 40 | 41 | 42 | def create_zilliz_secrets(): 43 | create_secret_block('zilliz-cloud-uri', 'ZILLIZ_CLOUD_URI') 44 | create_secret_block('zilliz-cloud-api-key', 'ZILLIZ_CLOUD_API_KEY') 45 | create_secret_block('zilliz-public-endpoint', 'ZILLIZ_PUBLIC_ENDPOINT') 46 | create_secret_block('zilliz-api-key', 'ZILLIZ_API_KEY') 47 | 48 | 49 | def create_slack_secrets(): 50 | create_secret_block('slack-bot-token', 'SLACK_BOT_TOKEN') 51 | 52 | 53 | def 
create_github_secrets(): 54 | create_secret_block('github-token', 'GITHUB_TOKEN') 55 | 56 | 57 | def create_upstash_redis_secrets(): 58 | create_secret_block('upstash-redis-rest-url', 'UPSTASH_REDIS_REST_URL') 59 | create_secret_block('upstash-redis-rest-token', 'UPSTASH_REDIS_REST_TOKEN') 60 | 61 | 62 | if __name__ == '__main__': 63 | create_gcp_creds_block() 64 | create_pinecone_secrets() 65 | create_zilliz_secrets() 66 | create_slack_secrets() 67 | create_github_secrets() 68 | create_upstash_redis_secrets() 69 | -------------------------------------------------------------------------------- /ingest/readers/custom_faq_gdoc_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Optional 3 | 4 | from llama_index.core.readers.base import BasePydanticReader 5 | from llama_index.core.schema import Document 6 | 7 | DEFAULT_TOKEN_JSON_PATH = 'token.json' 8 | DEFAULT_SERVICE_ACCOUNT_JSON_PATH = 'service_account.json' 9 | DEFAULT_CREDENTIALS_JSON_PATH = 'credentials.json' 10 | 11 | HEADING_STYLE_TEMPLATE = 'HEADING_{}' 12 | DEFAULT_QUESTION_HEADING_STYLE_NUM = 2 13 | 14 | EXCLUDED_LLM_METADATA_KEYS = ['source', 'title', 'section_name'] 15 | EXCLUDED_EMBED_METADATA_KEYS = ['source', 'title'] 16 | 17 | SCOPES = ["https://www.googleapis.com/auth/documents.readonly"] 18 | 19 | 20 | class FAQGoogleDocsReader(BasePydanticReader): 21 | token_json_path: str = DEFAULT_TOKEN_JSON_PATH 22 | service_account_json_path: str = DEFAULT_SERVICE_ACCOUNT_JSON_PATH 23 | credentials_json_path: str = DEFAULT_CREDENTIALS_JSON_PATH 24 | question_heading_style_num: int = DEFAULT_QUESTION_HEADING_STYLE_NUM 25 | is_remote: bool = True 26 | 27 | def __init__(self, 28 | token_json_path: Optional[str] = DEFAULT_TOKEN_JSON_PATH, 29 | service_account_json_path: Optional[str] = DEFAULT_SERVICE_ACCOUNT_JSON_PATH, 30 | credentials_json_path: Optional[str] = DEFAULT_CREDENTIALS_JSON_PATH, 31 | question_heading_style_num: Optional[int] = DEFAULT_QUESTION_HEADING_STYLE_NUM 32 | ) -> None: 33 | """Initialize with parameters.""" 34 | try: 35 | import google # noqa 36 | import google_auth_oauthlib # noqa 37 | import googleapiclient # noqa 38 | except ImportError as e: 39 | raise ImportError( 40 | '`google_auth_oauthlib`, `googleapiclient` and `google` ' 41 | 'must be installed to use the GoogleDocsReader.\n' 42 | 'Please run `pip install --upgrade google-api-python-client ' 43 | 'google-auth-httplib2 google-auth-oauthlib`.' 44 | ) from e 45 | super().__init__(token_json_path=token_json_path, 46 | service_account_json_path=service_account_json_path, 47 | credentials_json_path=credentials_json_path, 48 | question_heading_style_num=question_heading_style_num) 49 | 50 | @classmethod 51 | def class_name(cls) -> str: 52 | return 'CustomGoogleDocsReader' 53 | 54 | def load_data(self, document_ids: [str]) -> [Document]: 55 | """Load data from the input directory. 56 | 57 | Args: 58 | document_ids (List[str]): a list of document ids. 59 | """ 60 | if document_ids is None: 61 | raise ValueError('Must specify a "document_ids" in `load_kwargs`.') 62 | 63 | results = [] 64 | for document_id in document_ids: 65 | docs = self._load_docs(document_id) 66 | results.extend(docs) 67 | return results 68 | 69 | def _load_docs(self, document_id: str) -> [Document]: 70 | """Load a document from Google Docs. 71 | 72 | Args: 73 | document_id: the document id. 74 | 75 | Returns: 76 | The document text. 
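            In practice this is a list of Document objects, one per FAQ
            question, as produced by _structural_elements_to_docs.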
77 | """ 78 | import googleapiclient.discovery as discovery 79 | 80 | credentials = self._get_credentials() 81 | docs_service = discovery.build('docs', 'v1', credentials=credentials) 82 | doc = docs_service.documents().get(documentId=document_id).execute() 83 | doc_content = doc.get('body').get('content') 84 | doc_source = f'https://docs.google.com/document/d/{document_id}/edit#heading=' 85 | return self._structural_elements_to_docs(doc_content, doc_source) 86 | 87 | def _get_credentials(self) -> Any: 88 | """Get valid user credentials from storage. 89 | 90 | The file token.json stores the user's access and refresh tokens, and is 91 | created automatically when the authorization flow completes for the first 92 | time. 93 | 94 | Returns: 95 | Credentials, the obtained credential. 96 | """ 97 | from google.auth.transport.requests import Request 98 | from google.oauth2 import service_account 99 | from google.oauth2.credentials import Credentials 100 | from google_auth_oauthlib.flow import InstalledAppFlow 101 | 102 | creds = None 103 | if os.path.exists(self.token_json_path): 104 | creds = Credentials.from_authorized_user_file(self.token_json_path, SCOPES) 105 | elif os.path.exists(self.service_account_json_path): 106 | return service_account.Credentials.from_service_account_file( 107 | self.service_account_json_path, scopes=SCOPES 108 | ) 109 | # If there are no (valid) credentials available, let the user log in. 110 | if not creds or not creds.valid: 111 | if creds and creds.expired and creds.refresh_token: 112 | creds.refresh(Request()) 113 | else: 114 | flow = InstalledAppFlow.from_client_secrets_file( 115 | self.credentials_json_path, SCOPES 116 | ) 117 | creds = flow.run_local_server(port=8080) 118 | # Save the credentials for the next run 119 | with open(self.token_json_path, 'w') as token: 120 | token.write(creds.to_json()) 121 | 122 | return creds 123 | 124 | @staticmethod 125 | def _read_paragraph_element(element: Any) -> Any: 126 | """Return the text in the given ParagraphElement. 127 | 128 | Args: 129 | element: a ParagraphElement from a Google Doc. 130 | """ 131 | text_run = element.get('textRun') 132 | return text_run.get('content') if text_run else '' 133 | 134 | @staticmethod 135 | def _get_text_from_paragraph_elements(elements: [Any]) -> Any: 136 | return ''.join(FAQGoogleDocsReader._read_paragraph_element(elem) for elem in elements) 137 | 138 | def _structural_elements_to_docs(self, 139 | doc_elements: [Any], 140 | doc_source: str) -> [Document]: 141 | """Recurse through a list of Structural Elements. 142 | 143 | Read a document's text where text may be in nested elements. 144 | 145 | Args: 146 | doc_elements: a list of Structural Elements. 
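            doc_source: base URL of the source Google Doc; the heading id of
                each question is appended to it to build the per-document
                'source' metadata link.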
147 | """ 148 | docs = [] 149 | text = '' 150 | heading_id = '' 151 | section_name = '' 152 | question_heading_style = HEADING_STYLE_TEMPLATE.format(self.question_heading_style_num) 153 | section_heading_style = HEADING_STYLE_TEMPLATE.format(self.question_heading_style_num - 1) 154 | for value in doc_elements: 155 | if 'paragraph' in value: 156 | paragraph = value['paragraph'] 157 | elements = paragraph.get('elements') 158 | paragraph_text = FAQGoogleDocsReader._get_text_from_paragraph_elements(elements) 159 | if 'paragraphStyle' in paragraph and 'headingId' in paragraph['paragraphStyle']: 160 | named_style_type = paragraph['paragraphStyle']['namedStyleType'] 161 | if named_style_type in [ 162 | question_heading_style, 163 | section_heading_style, 164 | ]: 165 | # create previous document checking if it's not empty 166 | if text != '': 167 | node_metadata = { 168 | 'source': doc_source + heading_id, 169 | 'section_name': section_name, 170 | 'title': 'FAQ' 171 | } 172 | prev_doc = Document(text=text, 173 | metadata=node_metadata, 174 | excluded_embed_metadata_keys=EXCLUDED_EMBED_METADATA_KEYS, 175 | excluded_llm_metadata_keys=EXCLUDED_LLM_METADATA_KEYS) 176 | docs.append(prev_doc) 177 | if named_style_type == question_heading_style: 178 | heading_id = paragraph['paragraphStyle']['headingId'] 179 | text = paragraph_text 180 | else: 181 | section_name = paragraph_text 182 | text = '' 183 | else: 184 | text += paragraph_text 185 | return docs 186 | 187 | 188 | if __name__ == '__main__': 189 | reader = FAQGoogleDocsReader(service_account_json_path='../keys/service_account_key.json') 190 | docs = reader.load_data(['1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8']) 191 | print(docs) 192 | -------------------------------------------------------------------------------- /ingest/readers/slack_reader.py: -------------------------------------------------------------------------------- 1 | """Slack reader.""" 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | from datetime import datetime, timedelta 7 | from http.client import IncompleteRead 8 | from ssl import SSLContext 9 | from typing import Any, Optional 10 | 11 | from llama_index.core.bridge.pydantic import PrivateAttr 12 | from llama_index.core.readers.base import BasePydanticReader 13 | from llama_index.core.schema import Document 14 | 15 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, 16 | format='%(message)s') 17 | logger = logging.getLogger(__name__) 18 | EXCLUDED_METADATA_FIELDS = ['channel', 'thread_ts'] 19 | 20 | 21 | class SlackReader(BasePydanticReader): 22 | """Slack reader. 23 | 24 | Reads conversations from channels. If the earliest_date is provided, an 25 | optional latest_date can also be provided. If no latest_date is provided, 26 | we assume the latest date is the current timestamp. 27 | 28 | Args: 29 | slack_token (Optional[str]): Slack token. If not provided, we 30 | assume the environment variable `SLACK_BOT_TOKEN` is set. 31 | ssl (Optional[str]): Custom SSL context. If not provided, it is assumed 32 | there is already an SSL context available. 33 | earliest_date (Optional[datetime]): Earliest date from which 34 | to read conversations. If not provided, we read all messages. 35 | latest_date (Optional[datetime]): Latest date from which to 36 | read conversations. If not provided, defaults to current timestamp 37 | in combination with earliest_date. 
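        bot_user_id (Optional[str]): Slack user id of this bot. Its own
            replies are excluded from the indexed text, and threads answered
            only by the bot are skipped.
        not_ignore_users (Optional[list[str]]): user ids (such as automation
            bots) whose messages are indexed even if they have no replies; for
            these users the attachment text is read instead of the message text.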
38 | """ 39 | 40 | is_remote: bool = True 41 | slack_token: str 42 | earliest_date_timestamp: Optional[float] 43 | latest_date_timestamp: float 44 | bot_user_id: Optional[str] 45 | not_ignore_users: Optional[list[str]] = [] 46 | 47 | _client: Any = PrivateAttr() 48 | 49 | def __init__( 50 | self, 51 | slack_token: Optional[str] = None, 52 | ssl: Optional[SSLContext] = None, 53 | earliest_date: Optional[datetime] = None, 54 | latest_date: Optional[datetime] = None, 55 | earliest_date_timestamp: Optional[float] = None, 56 | latest_date_timestamp: Optional[float] = None, 57 | bot_user_id: Optional[str] = None, 58 | not_ignore_users: Optional[list[str]] = None 59 | ) -> None: 60 | """Initialize with parameters.""" 61 | from slack_sdk import WebClient 62 | 63 | if slack_token is None: 64 | slack_token = os.environ["SLACK_BOT_TOKEN"] 65 | if slack_token is None: 66 | raise ValueError( 67 | "Must specify `slack_token` or set environment " 68 | "variable `SLACK_BOT_TOKEN`." 69 | ) 70 | if ssl is None: 71 | self._client = WebClient(token=slack_token) 72 | else: 73 | self._client = WebClient(token=slack_token, ssl=ssl) 74 | if latest_date is not None and earliest_date is None: 75 | raise ValueError( 76 | "Must specify `earliest_date` if `latest_date` is specified." 77 | ) 78 | if not_ignore_users is None: 79 | not_ignore_users = [] 80 | if earliest_date is not None: 81 | earliest_date_timestamp = earliest_date.timestamp() 82 | else: 83 | earliest_date_timestamp = None or earliest_date_timestamp 84 | if latest_date is not None: 85 | latest_date_timestamp = latest_date.timestamp() 86 | else: 87 | latest_date_timestamp = datetime.now().timestamp() or latest_date_timestamp 88 | res = self._client.api_test() 89 | if not res["ok"]: 90 | raise ValueError(f"Error initializing Slack API: {res['error']}") 91 | 92 | super().__init__( 93 | slack_token=slack_token, 94 | earliest_date_timestamp=earliest_date_timestamp, 95 | latest_date_timestamp=latest_date_timestamp, 96 | bot_user_id=bot_user_id, 97 | not_ignore_users=not_ignore_users, 98 | ) 99 | 100 | @classmethod 101 | def class_name(cls) -> str: 102 | """Get the name identifier of the class.""" 103 | return "SlackReader" 104 | 105 | def _read_message(self, channel_id: str, message_ts: str) -> Document: 106 | from slack_sdk.errors import SlackApiError 107 | 108 | """Read a message.""" 109 | 110 | messages_text: list[str] = [] 111 | next_cursor = None 112 | while True: 113 | try: 114 | # https://slack.com/api/conversations.replies 115 | # List all replies to a message, including the message itself. 
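                # Replies are fetched page by page: the loop keeps requesting with the
                # returned next_cursor until has_more is False, and backs off via
                # sleep_on_ratelimit() when Slack returns a rate-limit error.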
116 | conversations_replies_kwargs = { 117 | "channel": channel_id, 118 | "ts": message_ts, 119 | "cursor": next_cursor, 120 | } 121 | if self.earliest_date_timestamp is not None: 122 | conversations_replies_kwargs |= { 123 | "latest": str(self.latest_date_timestamp), 124 | "oldest": str(self.earliest_date_timestamp), 125 | } 126 | result = self._client.conversations_replies( 127 | **conversations_replies_kwargs # type: ignore 128 | ) 129 | messages = result["messages"] 130 | messages_text.extend(message["text"] for message in messages if message['user'] != self.bot_user_id 131 | and message['user'] not in self.not_ignore_users) 132 | messages_text.extend(message["attachments"][0]["text"] for message in messages if 133 | message['user'] in self.not_ignore_users 134 | and "attachments" in message 135 | and "text" in message["attachments"][0]) 136 | 137 | if not result["has_more"]: 138 | break 139 | 140 | next_cursor = result["response_metadata"]["next_cursor"] 141 | except SlackApiError as e: 142 | self.sleep_on_ratelimit(e) 143 | 144 | return Document(text="\n\n".join(messages_text), 145 | metadata={"channel": channel_id, "thread_ts": float(message_ts)}, 146 | excluded_embed_metadata_keys=EXCLUDED_METADATA_FIELDS, 147 | excluded_llm_metadata_keys=EXCLUDED_METADATA_FIELDS 148 | ) 149 | 150 | def _read_channel(self, channel_id: str) -> list[Document]: 151 | from slack_sdk.errors import SlackApiError 152 | 153 | """Read a channel.""" 154 | 155 | thread_documents: list[Document] = [] 156 | next_cursor = None 157 | while True: 158 | try: 159 | # Call the conversations.history method using the WebClient 160 | # conversations.history returns the first 100 messages by default 161 | # These results are paginated, 162 | # see: https://api.slack.com/methods/conversations.history$pagination 163 | conversations_history_kwargs = { 164 | "channel": channel_id, 165 | "cursor": next_cursor, 166 | "latest": str(self.latest_date_timestamp), 167 | } 168 | if self.earliest_date_timestamp is not None: 169 | conversations_history_kwargs["oldest"] = str( 170 | self.earliest_date_timestamp 171 | ) 172 | result = self._client.conversations_history( 173 | **conversations_history_kwargs # type: ignore 174 | ) 175 | conversation_history = result["messages"] 176 | # Print results 177 | logger.info(f"{len(conversation_history)} messages found in {channel_id}") 178 | 179 | for message in conversation_history: 180 | if self.is_for_indexing(message): 181 | read_message: Document = self._read_message(channel_id, message["ts"]) 182 | if read_message.text != "": 183 | thread_documents.append(read_message) 184 | 185 | if not result["has_more"]: 186 | break 187 | next_cursor = result["response_metadata"]["next_cursor"] 188 | 189 | except SlackApiError as e: 190 | self.sleep_on_ratelimit(e) 191 | except IncompleteRead: 192 | continue 193 | 194 | return thread_documents 195 | 196 | @staticmethod 197 | def sleep_on_ratelimit(e): 198 | if e.response["error"] == "ratelimited": 199 | retry_after = e.response.headers["retry-after"] 200 | logger.error( 201 | f'Rate limit error reached, sleeping for: {retry_after} seconds' 202 | ) 203 | time.sleep(int(retry_after) + 1) 204 | else: 205 | logger.error(f"Error parsing conversation replies: {e}") 206 | 207 | def is_for_indexing(self, message): 208 | # ignore unanswered messages 209 | if 'reply_count' in message: 210 | # if bot user id isn't specified or bot hasn't replied the message 211 | if not self.bot_user_id or self.bot_user_id not in message['reply_users']: 212 | return True 213 | if 
message['reply_users_count'] > 1: 214 | return True 215 | # even if it's a single message but from a user in un-ignore list, index it 216 | elif message['user'] in self.not_ignore_users: 217 | return True 218 | return False 219 | 220 | def load_data(self, channel_ids: list[str]) -> list[Document]: 221 | """Load data from the input directory. 222 | 223 | Args: 224 | channel_ids (List[str]): List of channel ids to read. 225 | Returns: 226 | List[Document]: List of documents. 227 | """ 228 | results = [] 229 | for channel_id in channel_ids: 230 | results.extend(self._read_channel(channel_id)) 231 | return results 232 | 233 | 234 | if __name__ == "__main__": 235 | reader = SlackReader(earliest_date=datetime.now() - timedelta(days=2), 236 | bot_user_id='U05DM3PEJA2', 237 | not_ignore_users=['U01S08W6Z9T']) 238 | for thread in reader.load_data(channel_ids=["C02R98X7DS9"]): 239 | logger.info(f'Text: {thread.text}') 240 | logger.info(f'Metadata: {thread.metadata}') 241 | logger.info('----------------------------') 242 | -------------------------------------------------------------------------------- /ingest/readers/youtube_reader.py: -------------------------------------------------------------------------------- 1 | """YouTube reader.""" 2 | 3 | from llama_index.core.readers.base import BasePydanticReader 4 | from llama_index.core.schema import Document 5 | 6 | 7 | class YoutubeReader(BasePydanticReader): 8 | 9 | def __init__(self) -> None: 10 | try: 11 | from youtube_transcript_api import YouTubeTranscriptApi 12 | except ImportError as e: 13 | raise ImportError( 14 | '`youtube_transcript_api` must be installed to use the YoutubeReader.\n' 15 | 'Please run `pip install --upgrade youtube-transcript-api`.' 16 | ) from e 17 | 18 | super().__init__() 19 | 20 | @classmethod 21 | def class_name(cls) -> str: 22 | """Get the name identifier of the class.""" 23 | return "YoutubeReader" 24 | 25 | def load_data(self, video_ids: list[str], tokenizer) -> list[Document]: 26 | from youtube_transcript_api import YouTubeTranscriptApi 27 | 28 | documents: list[Document] = [] 29 | for video_id in video_ids: 30 | yt_title = YoutubeReader._read_title(video_id) 31 | current_start = None 32 | current_text = "" 33 | current_token_count = 0 34 | transcript_array = YouTubeTranscriptApi.get_transcript(video_id) 35 | 36 | for segment in transcript_array: 37 | # Get the token count of the current segment text 38 | token_count = len(tokenizer(segment["text"], truncation=False, add_special_tokens=False)['input_ids']) 39 | 40 | # If adding this segment exceeds 512 tokens, finalize the current document 41 | if current_token_count + token_count > 512: 42 | documents.append(Document( 43 | text=current_text.strip(), 44 | metadata=YoutubeReader._get_node_metadata(video_id, int(current_start), yt_title), 45 | excluded_embed_metadata_keys=['yt_link'], 46 | excluded_llm_metadata_keys=['yt_link'] 47 | )) 48 | 49 | # Start a new chunk 50 | current_start = segment["start"] 51 | current_text = segment["text"] 52 | current_token_count = token_count 53 | else: 54 | # Concatenate to the current chunk 55 | if not current_text: 56 | current_start = segment["start"] 57 | current_text += " " + segment["text"] 58 | current_token_count += token_count 59 | 60 | # Append the last chunk if it exists 61 | if current_text: 62 | documents.append(Document( 63 | text=current_text.strip(), 64 | metadata=YoutubeReader._get_node_metadata(video_id, int(current_start), yt_title), 65 | excluded_embed_metadata_keys=['yt_link'], 66 | 
excluded_llm_metadata_keys=['yt_link'] 67 | )) 68 | 69 | return documents 70 | 71 | @staticmethod 72 | def _get_node_metadata(video_id: str, pos: int, yt_title: str) -> dict: 73 | return { 74 | 'yt_link': f"https://www.youtube.com/watch?v={video_id}&t={pos}s", 75 | 'yt_title': yt_title 76 | } 77 | 78 | @staticmethod 79 | def _read_title(video_id: str) -> str: 80 | params = { 81 | "format": "json", 82 | "url": f"https://www.youtube.com/watch?v={video_id}" 83 | } 84 | url = "https://www.youtube.com/oembed" 85 | 86 | import requests 87 | response = requests.get(url, params=params) 88 | if response.status_code == 200: 89 | data = response.json() 90 | return data['title'] 91 | else: 92 | print(f"Failed to retrieve data: {response.status_code}") 93 | return '' 94 | -------------------------------------------------------------------------------- /ingest/requirements.txt: -------------------------------------------------------------------------------- 1 | slack-sdk==3.30.0 2 | langchain==0.1.20 3 | google-api-python-client==2.134.0 4 | google-auth-httplib2==0.2.0 5 | google-auth-oauthlib==1.2.0 6 | sentence-transformers==3.0.1 7 | prefect-gcp==0.5.12 8 | GitPython==3.1.43 9 | pymilvus==2.4.4 10 | llama-index-core==0.10.48 11 | llama-index-readers-web==0.1.19 12 | llama-index-readers-github==0.1.9 13 | llama-index-vector-stores-milvus==0.1.20 14 | llama-index-embeddings-langchain==0.1.2 15 | trafilatura==1.10.0 16 | nbconvert==7.16.4 17 | ipython==8.25.0 18 | upstash-redis==1.1.0 19 | jupyter-notebook-parser==0.1.4 20 | youtube-transcript-api==0.6.3 21 | -------------------------------------------------------------------------------- /ingest/utils/index_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | from datetime import datetime, timedelta 5 | 6 | from jupyter_notebook_parser import JupyterNotebookParser 7 | from langchain.embeddings import CacheBackedEmbeddings 8 | from langchain_community.embeddings import HuggingFaceEmbeddings 9 | from langchain_community.storage import UpstashRedisByteStore 10 | from llama_index.core import Settings 11 | from llama_index.core.indices import VectorStoreIndex 12 | from llama_index.core.node_parser import NodeParser, SentenceSplitter, MarkdownNodeParser 13 | from llama_index.core.schema import Document 14 | from llama_index.core.storage import StorageContext 15 | from llama_index.readers.github import GithubRepositoryReader, GithubClient 16 | from llama_index.readers.web import TrafilaturaWebReader 17 | from llama_index.vector_stores.milvus import MilvusVectorStore 18 | from prefect.blocks.system import Secret 19 | from prefect_gcp import GcpCredentials 20 | from upstash_redis import Redis 21 | 22 | from ingest.readers.custom_faq_gdoc_reader import FAQGoogleDocsReader 23 | from ingest.readers.slack_reader import SlackReader 24 | from ingest.readers.youtube_reader import YoutubeReader 25 | 26 | BOT_USER_ID = 'U05DM3PEJA2' 27 | AU_TOMATOR_USER_ID = 'U01S08W6Z9T' 28 | 29 | EXCLUDE_FILTER_TYPE = GithubRepositoryReader.FilterType.EXCLUDE 30 | 31 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 32 | 33 | embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5') 34 | 35 | embedding_dimension = len(embeddings.embed_query("test")) 36 | print(f'embedding dimension = {embedding_dimension}') 37 | 38 | 39 | def load_embeddings() -> CacheBackedEmbeddings: 40 | redis_client = Redis(url=Secret.load('upstash-redis-rest-url').get(), 41 | 
token=Secret.load('upstash-redis-rest-token').get()) 42 | embeddings_cache = UpstashRedisByteStore(client=redis_client, 43 | ttl=None, 44 | namespace=os.getenv('EMBEDDING_CACHE_NAMESPACE')) 45 | 46 | cached_embedder = CacheBackedEmbeddings.from_bytes_store( 47 | embeddings, 48 | embeddings_cache, 49 | namespace=embeddings.model_name + "/", 50 | ) 51 | return cached_embedder 52 | 53 | 54 | Settings.embed_model = load_embeddings() 55 | Settings.llm = None 56 | 57 | 58 | def index_spreadsheet(url: str, title: str, collection_name: str): 59 | documents = TrafilaturaWebReader().load_data([url]) 60 | for doc in documents: 61 | doc.metadata['title'] = title 62 | doc.metadata['source'] = url 63 | add_route_to_docs(documents, 'faq') 64 | add_to_index(documents, collection_name=collection_name) 65 | 66 | 67 | def add_route_to_docs(docs: [Document], route_name: str): 68 | route_key_name = 'route' 69 | for doc in docs: 70 | doc.metadata[route_key_name] = route_name 71 | doc.excluded_embed_metadata_keys.append(route_key_name) 72 | doc.excluded_llm_metadata_keys.append(route_key_name) 73 | 74 | 75 | def add_to_index(documents: list[Document], 76 | collection_name: str, 77 | overwrite: bool = False, 78 | node_parser: NodeParser = None): 79 | sentence_splitter = SentenceSplitter.from_defaults(chunk_size=512, chunk_overlap=50, 80 | tokenizer=embeddings.client.tokenizer) 81 | environment = os.getenv('EXECUTION_ENV', 'local') 82 | if environment == 'local': 83 | milvus_vector_store = MilvusVectorStore(uri='http://localhost:19530', 84 | collection_name=collection_name, 85 | dim=embedding_dimension, 86 | overwrite=overwrite) 87 | elif environment == 'zilliz-cluster': 88 | milvus_vector_store = MilvusVectorStore( 89 | uri=Secret.load('zilliz-public-endpoint').get(), 90 | token=Secret.load('zilliz-api-key').get(), 91 | collection_name=collection_name, 92 | dim=embedding_dimension, 93 | overwrite=overwrite) 94 | else: 95 | milvus_vector_store = MilvusVectorStore(collection_name=collection_name, 96 | uri=Secret.load('zilliz-cloud-uri').get(), 97 | token=Secret.load('zilliz-cloud-api-key').get(), 98 | dim=embedding_dimension, 99 | overwrite=overwrite) 100 | storage_context = StorageContext.from_defaults(vector_store=milvus_vector_store) 101 | transformations = [t for t in [node_parser, sentence_splitter] if t is not None] 102 | 103 | VectorStoreIndex.from_documents(documents, 104 | transformations=transformations, 105 | storage_context=storage_context, 106 | show_progress=True) 107 | 108 | 109 | def index_github_repo(owner: str, 110 | repo: str, 111 | branch: str, 112 | collection_name: str, 113 | ignore_file_extensions: [str] = None, 114 | ignore_directories: [str] = None, 115 | ): 116 | if ignore_file_extensions is None: 117 | ignore_file_extensions = ['.jpg', '.png', '.svg', '.gitignore', '.csv', '.jar'] 118 | if ignore_directories is None: 119 | ignore_directories = ['.github', '.gitignore', '2021', '2022', 'images'] 120 | github_client = GithubClient(Secret.load('github-token').get(), verbose=True) 121 | documents = GithubRepositoryReader( 122 | github_client=github_client, 123 | owner=owner, 124 | repo=repo, 125 | filter_directories=(ignore_directories, EXCLUDE_FILTER_TYPE), 126 | filter_file_extensions=(ignore_file_extensions, EXCLUDE_FILTER_TYPE), 127 | ).load_data(branch=branch) 128 | for doc in documents: 129 | doc.metadata['branch'] = branch 130 | doc.metadata['owner'] = owner 131 | doc.metadata['repo'] = repo 132 | add_route_to_docs(documents, 'github') 133 | 134 | ipynb_docs = [parse_ipynb_doc(doc) for 
doc in documents if doc.metadata.get('file_name', '').endswith('.ipynb')] 135 | md_docs = [doc for doc in documents if doc.metadata.get('file_name', '').endswith('.md')] 136 | other_docs = [doc for doc in documents if not doc.metadata.get('file_name', '').endswith(('.ipynb', '.md'))] 137 | 138 | add_to_index(other_docs, collection_name=collection_name) 139 | add_to_index(md_docs, collection_name=collection_name, node_parser=MarkdownNodeParser()) 140 | add_to_index(ipynb_docs, collection_name=collection_name) 141 | 142 | 143 | def parse_ipynb_doc(ipynb_doc: Document) -> Document: 144 | ipynb_json = json.loads(ipynb_doc.text) 145 | temp_ipynb = tempfile.NamedTemporaryFile(suffix='.ipynb') 146 | try: 147 | with open(temp_ipynb.name, 'w') as f_out: 148 | json.dump(ipynb_json, f_out) 149 | parsed = JupyterNotebookParser(temp_ipynb.name) 150 | all_cells = parsed.get_all_cells() 151 | parsed_text = ''.join([JupyterNotebookParser._join_source_lines(cell.get('source', '')) 152 | for cell in all_cells]) 153 | ipynb_doc.text = parsed_text 154 | return ipynb_doc 155 | finally: 156 | temp_ipynb.close() 157 | 158 | 159 | def index_slack_history(channel_ids: [str], collection_name: str): 160 | earliest_date = datetime.now() - timedelta(days=90) 161 | slack_reader = SlackReader(earliest_date=earliest_date, 162 | bot_user_id=BOT_USER_ID, 163 | not_ignore_users=[AU_TOMATOR_USER_ID], 164 | slack_token=Secret.load('slack-bot-token').get()) 165 | print('Starting to load slack messages from the last 90 days') 166 | documents = slack_reader.load_data(channel_ids=channel_ids) 167 | add_route_to_docs(documents, 'slack') 168 | print('Starting to add loaded Slack messages to the index') 169 | add_to_index(documents, collection_name=collection_name) 170 | 171 | 172 | def index_faq(document_ids: [str], collection_name: str): 173 | temp_creds = tempfile.NamedTemporaryFile() 174 | creds_dict = GcpCredentials.load("google-drive-creds").service_account_info.get_secret_value() 175 | with open(temp_creds.name, 'w') as f_out: 176 | json.dump(creds_dict, f_out) 177 | gdocs_reader = FAQGoogleDocsReader(service_account_json_path=temp_creds.name) 178 | print('Starting to load FAQ document') 179 | documents = gdocs_reader.load_data(document_ids=document_ids) 180 | temp_creds.close() 181 | add_route_to_docs(documents, 'faq') 182 | print('Starting to add loaded FAQ document to the index') 183 | add_to_index(documents, 184 | collection_name=collection_name, 185 | overwrite=True, 186 | ) 187 | 188 | 189 | def index_youtube(video_ids: list[str], collection_name: str): 190 | yt_reader = YoutubeReader() 191 | documents = yt_reader.load_data(video_ids=video_ids, tokenizer=embeddings.client.tokenizer) 192 | print('Starting to add loaded Video transcripts to the index') 193 | add_to_index(documents, collection_name=collection_name) 194 | -------------------------------------------------------------------------------- /prefect.yaml: -------------------------------------------------------------------------------- 1 | # Welcome to your prefect.yaml file! You can use this file for storing and managing 2 | # configuration for deploying your flows. We recommend committing this file to source 3 | # control along with your flow code. 
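# Assuming the Prefect CLI is authenticated against the target workspace, the deployments
# declared below can usually be registered all at once with `prefect deploy --all`,
# or one at a time with `prefect deploy --name <deployment-name>`.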
4 | 5 | # Generic metadata about this project 6 | name: zoomcamp-bot-index 7 | prefect-version: 2.19.5 8 | 9 | # build section allows you to manage and build docker images 10 | build: 11 | 12 | # push section allows you to manage if and how this project is uploaded to remote locations 13 | push: 14 | - prefect_docker.deployments.steps.push_docker_image: 15 | requires: prefect-docker>=0.3.1 16 | image_name: '{{ build_image.image_name }}' 17 | tag: '{{ build_image.tag }}' 18 | 19 | # pull section allows you to provide instructions for cloning this project in remote locations 20 | pull: 21 | 22 | 23 | definitions: 24 | work_pools: 25 | zoomcamp_faq_bot_workpool: &zoomcamp-faq-bot-pool 26 | name: zoomcamp-faq-bot 27 | work_queue_name: docker_queue 28 | job_variables: &job-variables 29 | image: '{{ build_image.image }}' 30 | env: 31 | EXECUTION_ENV: zilliz 32 | auto_remove: true 33 | schedules: 34 | at_0_daily: &at_0_daily 35 | cron: 0 0 * * * 36 | timezone: Europe/Madrid 37 | day_or: true 38 | at_1_daily: &at_1_daily 39 | cron: 0 1 * * * 40 | timezone: Europe/Madrid 41 | day_or: true 42 | at_23_monday: &at_23_monday 43 | cron: 0 23 * * 1 44 | timezone: Europe/Madrid 45 | day_or: true 46 | at_23_tuesday: &at_23_tuesday 47 | cron: 0 23 * * 2 48 | timezone: Europe/Madrid 49 | day_or: true 50 | at_23_wednesday: &at_23_wednesday 51 | cron: 0 23 * * 3 52 | timezone: Europe/Madrid 53 | day_or: true 54 | actions: 55 | docker_build: 56 | - prefect.deployments.steps.run_shell_script: &shell-script-config 57 | id: get-commit-hash 58 | script: git rev-parse --short HEAD 59 | stream_output: false 60 | - prefect_docker.deployments.steps.build_docker_image: &docker-build-config 61 | id: build_image 62 | requires: prefect-docker>=0.3.1 63 | tag: '{{ get-commit-hash.stdout }}' 64 | platform: linux/amd64 65 | 66 | 67 | # the deployments section allows you to provide configuration for deploying flows 68 | deployments: 69 | - name: fill-index-zilliz-ml 70 | tags: 71 | - ml-ingest 72 | - zoomcamp-faq-bot 73 | description: Fill Zilliz index for ML Zoomcamp 74 | schedules: 75 | - *at_23_tuesday 76 | entrypoint: ingest/ml/ingest_ml.py:fill_ml_index 77 | work_pool: *zoomcamp-faq-bot-pool 78 | build: 79 | - prefect.deployments.steps.run_shell_script: *shell-script-config 80 | - prefect_docker.deployments.steps.build_docker_image: 81 | <<: *docker-build-config # Uses the docker_build_config and overrides the dockerfile and image_name fields 82 | dockerfile: ingest/ml.dockerfile 83 | image_name: aaalexlit/zoomcamp-faq-ingest-ml 84 | pull: 85 | - prefect.deployments.steps.set_working_directory: 86 | directory: /usr/src 87 | - name: fill-index-zilliz-de 88 | tags: 89 | - de-ingest 90 | - zoomcamp-faq-bot 91 | description: Fill Zilliz index for DE Zoomcamp 92 | schedules: 93 | - *at_23_monday 94 | entrypoint: ingest/de/ingest_de.py:fill_de_index 95 | work_pool: *zoomcamp-faq-bot-pool 96 | build: 97 | - prefect.deployments.steps.run_shell_script: *shell-script-config 98 | - prefect_docker.deployments.steps.build_docker_image: 99 | <<: *docker-build-config 100 | # Uses the docker_build_config and overrides the dockerfile and image_name fields 101 | dockerfile: ingest/de.dockerfile 102 | image_name: aaalexlit/zoomcamp-faq-ingest-de 103 | pull: 104 | - prefect.deployments.steps.set_working_directory: 105 | directory: /usr/src 106 | - name: fill-index-zilliz-mlops 107 | tags: 108 | - mlops-ingest 109 | - zoomcamp-faq-bot 110 | description: Fill Zilliz index for MLOps Zoomcamp 111 | schedules: 112 | - *at_0_daily 113 | 
entrypoint: ingest/mlops/ingest_mlops.py:fill_mlops_index 114 | work_pool: 115 | <<: *zoomcamp-faq-bot-pool 116 | job_variables: 117 | <<: *job-variables 118 | env: 119 | EXECUTION_ENV: zilliz-cluster 120 | build: 121 | - prefect.deployments.steps.run_shell_script: *shell-script-config 122 | - prefect_docker.deployments.steps.build_docker_image: 123 | <<: *docker-build-config 124 | # Uses the docker_build_config and overrides the dockerfile and image_name fields 125 | dockerfile: ingest/mlops.dockerfile 126 | image_name: aaalexlit/zoomcamp-faq-ingest-mlops 127 | pull: 128 | - prefect.deployments.steps.set_working_directory: 129 | directory: /usr/src 130 | - name: fill-index-zilliz-llm 131 | tags: 132 | - llm-ingest 133 | - zoomcamp-faq-bot 134 | description: Fill Zilliz index for LLM Zoomcamp 135 | schedules: 136 | - *at_23_wednesday 137 | entrypoint: ingest/llm/ingest_llm.py:fill_llm_index 138 | work_pool: 139 | <<: *zoomcamp-faq-bot-pool 140 | job_variables: 141 | <<: *job-variables 142 | env: 143 | EXECUTION_ENV: zilliz-cluster 144 | build: 145 | - prefect.deployments.steps.run_shell_script: *shell-script-config 146 | - prefect_docker.deployments.steps.build_docker_image: 147 | <<: *docker-build-config 148 | # Uses the docker_build_config and overrides the dockerfile and image_name fields 149 | dockerfile: ingest/llm.dockerfile 150 | image_name: aaalexlit/zoomcamp-faq-ingest-llm 151 | pull: 152 | - prefect.deployments.steps.set_working_directory: 153 | directory: /usr/src 154 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | slack-bolt 3 | slack-sdk 4 | langchain 5 | google-api-python-client 6 | google-auth-httplib2 7 | google-auth-oauthlib 8 | sentence-transformers 9 | prefect 10 | prefect-gcp 11 | GitPython 12 | pymilvus 13 | llama-index-core 14 | llama-index-readers-web 15 | llama-index-readers-github 16 | llama-index-vector-stores-milvus 17 | llama-index-embeddings-langchain 18 | llama-index-postprocessor-cohere-rerank 19 | llama-index-llms-langchain 20 | llama-index-llms-fireworks 21 | ipython 22 | cohere 23 | trafilatura 24 | nbconvert 25 | prefect-docker 26 | langchain-openai 27 | upstash-redis 28 | jupyter-notebook-parser 29 | requests==2.31.0 30 | youtube-transcript-api 31 | -------------------------------------------------------------------------------- /slack_bot/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | WORKDIR /usr/src/app 7 | 8 | COPY requirements.txt ./ 9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | COPY main.py ./ 12 | 13 | CMD [ "python", "-u", "./main.py" ] -------------------------------------------------------------------------------- /slack_bot/README.md: -------------------------------------------------------------------------------- 1 | # Running the bot locally 2 | 3 | 1. re-create separate conda environment using [slack_bot/requirements.txt](../slack_bot/requirements.txt) 4 | ```shell 5 | conda activate base 6 | conda remove --name slack-bot --all 7 | conda create --name slack-bot python=3.10 8 | conda activate slack-bot 9 | cd slack_bot 10 | pip install -r requirements.txt 11 | ``` 12 | 1. Rename [dev.env](../dev.env) to `.env` and set all the required variables 13 | 14 | 1. 
Run ingestion with local milvus following [local_development.md](../ingest/local_development.md) 15 | 16 | 1. Run [main.py](main.py) 17 | 18 | ```shell 19 | source .env 20 | python main.py 21 | ``` 22 | In Pycharm IDE use a provided run configuration [run_bot_local_ws.run.xml](../.run/run_bot_local_ws.run.xml) 23 | -------------------------------------------------------------------------------- /slack_bot/app_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "display_information": { 3 | "name": "FAQBotForMLOps", 4 | "description": "MLOps FAQ as a bot", 5 | "background_color": "#2e7898" 6 | }, 7 | "features": { 8 | "bot_user": { 9 | "display_name": "QABotForMLOps", 10 | "always_online": false 11 | } 12 | }, 13 | "oauth_config": { 14 | "scopes": { 15 | "bot": [ 16 | "app_mentions:read", 17 | "channels:history", 18 | "channels:read", 19 | "chat:write" 20 | ] 21 | } 22 | }, 23 | "settings": { 24 | "event_subscriptions": { 25 | "bot_events": [ 26 | "app_mention" 27 | ] 28 | }, 29 | "interactivity": { 30 | "is_enabled": true 31 | }, 32 | "org_deploy_enabled": false, 33 | "socket_mode_enabled": true, 34 | "token_rotation_enabled": false 35 | } 36 | } -------------------------------------------------------------------------------- /slack_bot/bot_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaalexlit/faq-slack-bot/78962d17dbd68438bd443b7dfdb961ac9d13e574/slack_bot/bot_icon.png -------------------------------------------------------------------------------- /slack_bot/dev.env: -------------------------------------------------------------------------------- 1 | # datatalks slack token 2 | SLACK_APP_TOKEN=xapp-.. 3 | SLACK_BOT_TOKEN=xoxb-.. 4 | 5 | # OpenAI API key 6 | OPENAI_API_KEY=sk-.. 7 | 8 | PINECONE_API_KEY=.. 9 | PINECONE_ENV=.. 10 | 11 | WANDB_API_KEY=.. 12 | LANGCHAIN_API_KEY=lsv2_.. 13 | 14 | ZILLIZ_CLOUD_URI=https://.. 15 | ZILLIZ_CLOUD_API_KEY=.. 16 | 17 | ZILLIZ_PUBLIC_ENDPOINT=https://.. 18 | ZILLIZ_API_KEY=.. 19 | 20 | COHERE_API_KEY=.. 21 | 22 | # DEBUG log level 23 | #LOG_LEVEL=10 24 | -------------------------------------------------------------------------------- /slack_bot/docker-compose-my-workspace.yml: -------------------------------------------------------------------------------- 1 | services: 2 | faq-slack-bot: 3 | build: 4 | context: . 5 | platform: linux/amd64 6 | env_file: 7 | - ../.env 8 | environment: 9 | - LOCALHOST=host.docker.internal 10 | -------------------------------------------------------------------------------- /slack_bot/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | faq-slack-bot: 3 | build: 4 | context: . 
5 | platform: linux/amd64 6 | env_file: 7 | - .env 8 | -------------------------------------------------------------------------------- /slack_bot/main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import hashlib 3 | import logging 4 | import os 5 | import re 6 | import sys 7 | import uuid 8 | 9 | from cohere.core import ApiError as CohereAPIError 10 | from langchain import callbacks 11 | from langchain_openai import ChatOpenAI 12 | from langsmith import Client 13 | from llama_index.core import ChatPromptTemplate 14 | from llama_index.core import VectorStoreIndex, Settings 15 | from llama_index.core import get_response_synthesizer 16 | from llama_index.core.llms import ChatMessage, MessageRole 17 | from llama_index.core.postprocessor import TimeWeightedPostprocessor 18 | from llama_index.core.query_engine import RetrieverQueryEngine 19 | # from llama_index.postprocessor.cohere_rerank import CohereRerank 20 | from llama_index.vector_stores.milvus import MilvusVectorStore 21 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 22 | from requests.exceptions import ChunkedEncodingError 23 | from slack_bolt import App 24 | from slack_bolt.adapter.socket_mode import SocketModeHandler 25 | from slack_sdk.models.views import View 26 | from slack_sdk.web import WebClient 27 | 28 | logging.basicConfig(stream=sys.stdout, 29 | level=os.getenv('LOG_LEVEL', logging.INFO), 30 | format='%(asctime)s %(message)s', 31 | datefmt='%d-%m-%Y %H:%M:%S', ) 32 | logger = logging.getLogger(__name__) 33 | 34 | DE_CHANNELS = ['C01FABYF2RG', 'C06CBSE16JC', 'C06BZJX8PSP'] 35 | ML_CHANNELS = ['C0288NJ5XSA', 'C05C3SGMLBB', 'C05DTQECY66'] 36 | MLOPS_CHANNELS = ['C02R98X7DS9', 'C06C1N46CQ1', 'C0735558X52'] 37 | LLM_CHANNELS = ['C079QE5NAMP', 'C078X7REVN3', 'C06TEGTGM3J'] 38 | 39 | ALLOWED_CHANNELS = DE_CHANNELS + ML_CHANNELS + MLOPS_CHANNELS + LLM_CHANNELS 40 | 41 | PROJECT_NAME = 'datatalks-faq-slackbot' 42 | ML_ZOOMCAMP_PROJECT_NAME = 'ml-zoomcamp-slack-bot' 43 | DE_ZOOMCAMP_PROJECT_NAME = 'de-zoomcamp-slack-bot' 44 | 45 | ML_COLLECTION_NAME = 'mlzoomcamp_faq_git' 46 | DE_COLLECTION_NAME = 'dezoomcamp_faq_git' 47 | MLOPS_COLLECTION_NAME = 'mlopszoomcamp' 48 | LLM_COLLECTION_NAME = 'llmzoomcamp' 49 | 50 | GPT_MODEL_NAME = 'gpt-4o-mini-2024-07-18' 51 | 52 | # Event API & Web API 53 | SLACK_BOT_TOKEN = os.getenv('SLACK_BOT_TOKEN') 54 | SLACK_APP_TOKEN = os.getenv('SLACK_APP_TOKEN') 55 | app = App(token=SLACK_BOT_TOKEN) 56 | langsmith_client = Client() 57 | 58 | 59 | @app.action('upvote') 60 | def add_positive_feedback(ack, body): 61 | ack() 62 | add_feedback(body, 'upvote') 63 | 64 | 65 | @app.action('downvote') 66 | def add_negative_feedback(ack, body): 67 | ack() 68 | add_feedback(body, 'downvote') 69 | 70 | 71 | def add_feedback(body, feedback_type: str): 72 | run_id = None 73 | feedback_id = None 74 | try: 75 | original_blocks = body['message']['blocks'] 76 | actions_block_elements = [block for block in original_blocks if block.get('type') == 'actions'][0]['elements'] 77 | element_to_update = \ 78 | [element for element in actions_block_elements if element.get('action_id') == feedback_type][0] 79 | element_text_to_update = element_to_update['text']['text'] 80 | updated_text, updated_number = increment_number_in_string(element_text_to_update) 81 | element_to_update['text']['text'] = updated_text 82 | 83 | run_id = body['actions'][0]['value'] 84 | feedback_id = get_feedback_id_from_run_id_and_feedback_type(run_id, feedback_type) 85 | 86 | 
user_id = body['user']['id'] 87 | user_name = body['user']['username'] 88 | 89 | logger.info(f'run_id {run_id} {feedback_type}d by {user_name}({user_id})') 90 | 91 | if updated_number > 1: 92 | langsmith_client.update_feedback( 93 | feedback_id=feedback_id, 94 | score=updated_number 95 | ) 96 | else: 97 | langsmith_client.create_feedback( 98 | run_id=run_id, 99 | key=feedback_type, 100 | score=updated_number, 101 | feedback_id=feedback_id 102 | ) 103 | 104 | client.chat_update( 105 | channel=body['channel']['id'], 106 | ts=body['message']['ts'], 107 | blocks=original_blocks, 108 | text=body['message']['text'] 109 | ) 110 | except Exception as ex: 111 | error_message = f'An error occurred when trying to record user feedback with action body =\n{body}\n' 112 | if run_id: 113 | error_message += f'for run_id = {run_id}\n' 114 | if feedback_id: 115 | error_message += f'and feedback_id = {feedback_id}\n' 116 | 117 | logger.error(f'{error_message}' 118 | f'Error: {ex}') 119 | show_feedback_logging_error_modal(body['trigger_id']) 120 | 121 | 122 | def show_feedback_logging_error_modal(trigger_id): 123 | client.views_open(trigger_id=trigger_id, 124 | view=View(type='modal', 125 | title='Error recording feedback', 126 | blocks=[ 127 | { 128 | "type": "section", 129 | "text": { 130 | "type": "mrkdwn", 131 | "text": ( 132 | "An error occurred while attempting to capture your feedback.\n" 133 | "Please try again later. Apologies for the inconvenience.") 134 | } 135 | } 136 | ])) 137 | 138 | 139 | def get_feedback_id_from_run_id_and_feedback_type(run_id, feedback_type): 140 | # Combine run_id UUID bytes and action bytes 141 | combined_bytes = uuid.UUID(run_id).bytes + feedback_type.encode('utf-8') 142 | # Hash the combined bytes 143 | hashed_bytes = hashlib.sha1(combined_bytes).digest() 144 | # Convert hashed bytes to UUID 145 | return uuid.UUID(bytes=hashed_bytes[:16]) 146 | 147 | 148 | # This gets activated when the bot is tagged in a channel 149 | @app.event("app_mention") 150 | def handle_message_events(body): 151 | channel_id = body["event"]["channel"] 152 | event_ts = body["event"]["event_ts"] 153 | user = body["event"]["user"] 154 | 155 | if channel_id not in ALLOWED_CHANNELS: 156 | client.chat_postMessage(channel=channel_id, 157 | thread_ts=event_ts, 158 | text="Apologies, I can't answer questions in this channel") 159 | return 160 | 161 | # Extract question from the message text 162 | question = remove_mentions(str(body["event"]["text"])) 163 | if question.strip() == '': 164 | client.chat_postMessage(channel=channel_id, 165 | thread_ts=event_ts, 166 | text=('Ooops! It seems like your question is empty. 
' 167 | 'Please make sure to tag me in your message along with your question.') 168 | ) 169 | return 170 | logger.info(question) 171 | 172 | # Let the user know that we are busy with the request 173 | greeting_message = get_greeting_message(channel_id) 174 | 175 | posted_greeting_message = client.chat_postMessage(channel=channel_id, 176 | thread_ts=event_ts, 177 | text=greeting_message, 178 | unfurl_links=False) 179 | try: 180 | with callbacks.collect_runs() as cb: 181 | if channel_id in MLOPS_CHANNELS: 182 | response = mlops_query_engine.query(question) 183 | elif channel_id in ML_CHANNELS: 184 | response = ml_query_engine.query(question) 185 | elif channel_id in LLM_CHANNELS: 186 | response = llm_query_engine.query(question) 187 | else: 188 | response = de_query_engine.query(question) 189 | # get the id of the last run that's supposedly a run that delivers the final answer 190 | run_id = cb.traced_runs[-1].id 191 | 192 | response_text = f"Hey, <@{user}>! Here you go: \n{response}" 193 | 194 | response_blocks = [ 195 | { 196 | "type": "section", 197 | "text": { 198 | "type": "mrkdwn", 199 | "text": response_text 200 | } 201 | }, 202 | { 203 | "type": "divider" 204 | }] 205 | if hasattr(response, "source_nodes"): 206 | sources = links_to_source_nodes(response) 207 | references = f"References:\n{sources}" 208 | references_blocks = [{ 209 | "type": "section", 210 | "text": { 211 | "type": "mrkdwn", 212 | "text": references 213 | } 214 | }, 215 | { 216 | "type": "divider" 217 | }] 218 | response_blocks.extend(references_blocks) 219 | 220 | response_blocks.extend([{ 221 | "type": "context", 222 | "elements": [ 223 | { 224 | "type": "mrkdwn", 225 | "text": ":pray: Please leave your feedback to help me improve " 226 | } 227 | ] 228 | }, 229 | { 230 | "type": "actions", 231 | "elements": [ 232 | { 233 | "type": "button", 234 | "text": { 235 | "type": "plain_text", 236 | "text": ":thumbsup: 0" 237 | }, 238 | "style": "primary", 239 | "value": f"{run_id}", 240 | "action_id": "upvote" 241 | }, 242 | { 243 | "type": "button", 244 | "text": { 245 | "type": "plain_text", 246 | "text": ":thumbsdown: 0" 247 | }, 248 | "style": "danger", 249 | "value": f"{run_id}", 250 | "action_id": "downvote" 251 | } 252 | ] 253 | } 254 | ]) 255 | 256 | client.chat_postMessage(channel=channel_id, 257 | thread_ts=event_ts, 258 | blocks=response_blocks, 259 | text=response_text, 260 | unfurl_media=False 261 | ) 262 | client.chat_delete(channel=channel_id, 263 | ts=posted_greeting_message.data['ts']) 264 | except CohereAPIError: 265 | client.chat_postMessage(channel=channel_id, 266 | thread_ts=event_ts, 267 | text="There was an error, please try again later") 268 | except Exception as e: 269 | logger.error(f'Error responding to a query\n{e}') 270 | client.chat_postMessage(channel=channel_id, 271 | thread_ts=event_ts, 272 | text=f"There was an error: {e}") 273 | 274 | 275 | def links_to_source_nodes(response): 276 | res = set() 277 | source_nodes = response.source_nodes 278 | link_template = 'https://datatalks-club.slack.com/archives/{}/p{}' 279 | for node in source_nodes: 280 | # Slack 281 | if 'channel' in node.metadata: 282 | channel_id = node.metadata['channel'] 283 | thread_ts = node.metadata['thread_ts'] 284 | thread_ts_str = str(thread_ts).replace('.', '') 285 | link_template.format(channel_id, thread_ts_str) 286 | res.add(link_template.format(channel_id, thread_ts_str)) 287 | # Google doc 288 | elif 'source' in node.metadata: 289 | title = node.metadata['title'] 290 | if title == 'FAQ': 291 | section_title = 
node.text.split('\n', 1)[0] 292 | res.add(f"<{node.metadata['source']}|" 293 | f" {title}-{section_title}...> ") 294 | else: 295 | res.add(f"<{node.metadata['source']}| {title}>") 296 | # GitHub 297 | elif 'repo' in node.metadata: 298 | repo = node.metadata['repo'] 299 | owner = node.metadata['owner'] 300 | branch = node.metadata['branch'] 301 | file_path = node.metadata['file_path'] 302 | link_to_file = build_repo_path(owner=owner, repo=repo, branch=branch, file_path=file_path) 303 | res.add(f'<{link_to_file}| GitHub-{repo}-{file_path.split("/")[-1]}>') 304 | elif 'yt_link' in node.metadata: 305 | yt_link = node.metadata['yt_link'] 306 | yt_title = node.metadata['yt_title'] 307 | res.add(f'<{yt_link}| Youtube-{yt_title}>') 308 | return '\n'.join(res) 309 | 310 | 311 | def increment_number_in_string(source_string): 312 | # Regular expression to find any sequence of digits (\d+) 313 | pattern = r'(\d+)' 314 | 315 | # Define a lambda function to replace matched digits with the incremented value 316 | replacer = lambda match: str(int(match.group(0)) + 1) 317 | 318 | # Use re.sub() to replace matched digits with the incremented value 319 | result_string = re.sub(pattern, replacer, source_string) 320 | result_number = int(re.search(pattern, result_string).group(0)) 321 | 322 | return result_string, result_number 323 | 324 | 325 | def build_repo_path(owner: str, repo: str, branch: str, file_path: str): 326 | return f'https://github.com/{owner}/{repo}/blob/{branch}/{file_path}' 327 | 328 | 329 | def remove_mentions(input_text): 330 | # Define a regular expression pattern to match the mention 331 | mention_pattern = r'<@U[0-9A-Z]+>' 332 | 333 | return re.sub(mention_pattern, '', input_text) 334 | 335 | 336 | def get_greeting_message(channel_id): 337 | message_template = "Hello from {name} FAQ Bot! :robot_face: \n" \ 338 | "Please note that I'm under active development. " \ 339 | "The answers might not be accurate since I'm " \ 340 | "just a human-friendly interface to the " \ 341 | "" \ 342 | ", this Slack channel, and this course's ." \ 343 | "\nThanks for your request, I'm on it!" 
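    # Pick the course-specific display name, FAQ Google Doc fragment, and GitHub repo slug
    # based on which course channel the bot was mentioned in.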
344 | if channel_id in MLOPS_CHANNELS: 345 | name = 'MLOps' 346 | link = '12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit#heading=h.uwpp1jrsj0d' 347 | repo = 'mlops-zoomcamp' 348 | elif channel_id in ML_CHANNELS: 349 | name = 'ML' 350 | link = '1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8/edit#heading=h.98qq6wfuzeck' 351 | repo = 'machine-learning-zoomcamp' 352 | elif channel_id in LLM_CHANNELS: 353 | name = 'LLM' 354 | link = '1m2KexowAXTmexfC5rVTCSnaShvdUQ8Ag2IEiwBDHxN0/edit#heading=h.o29af0z8xx88' 355 | repo = 'llm-zoomcamp' 356 | else: 357 | name = 'DE' 358 | link = '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit#heading=h.o29af0z8xx88' 359 | repo = 'data-engineering-zoomcamp' 360 | return message_template.format(name=name, link=link, repo=repo) 361 | 362 | 363 | def log_to_langsmith(): 364 | os.environ["LANGCHAIN_TRACING_V2"] = "true" 365 | os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" 366 | os.environ["LANGCHAIN_PROJECT"] = PROJECT_NAME 367 | 368 | 369 | def get_prompt_template(zoomcamp_name: str, cohort_year: int, course_start_date: str) -> ChatPromptTemplate: 370 | system_prompt = ChatMessage( 371 | content=( 372 | "You are a helpful AI assistant for the {zoomcamp_name} ZoomCamp course at DataTalksClub, " 373 | "and you can be found in the course's Slack channel.\n" 374 | "As a trustworthy assistant, you must provide helpful answers to students' questions about the course, " 375 | "and assist them in finding solutions when they encounter problems/errors while following the course. \n" 376 | "You must do it using only the excerpts from the course FAQ document, Slack threads, and GitHub repository " 377 | "that are provided to you, without relying on prior knowledge.\n" 378 | "Current cohort is year {cohort_year} one and the course start date is {course_start_date}. \n" 379 | "Today is {current_date}. Take this into account when answering questions with temporal aspect. \n" 380 | "Here are your guidelines:\n" 381 | "- Provide clear and concise explanations for your conclusions, including relevant evidences, and " 382 | "relevant code snippets if the question pertains to code. \n" 383 | "- Avoid starting your answer with 'Based on the provided ...' or 'The context information ...' " 384 | "or anything like this, instead, provide the information directly in the response.\n" 385 | "- Justify your response in detail by explaining why you made the conclusions you actually made.\n" 386 | "- In your response, refrain from rephrasing the user's question or problem; simply provide an answer.\n" 387 | "- Make sure that the code examples you provide are accurate and runnable.\n" 388 | "- If the question requests confirmation, avoid repeating the question. Instead, conduct your own " 389 | "analysis based on the provided sources.\n" 390 | "- In cases where the provided information is insufficient and you are uncertain about the response, " 391 | "reply with: 'I don't think I have an answer for this; you'll have to ask your fellows or instructors.\n" 392 | "- All the hyperlinks need to be taken from the provided excerpts, not from prior knowledge. 
" 393 | "If there are no hyperlinks provided, abstain from adding hyperlinks to the answer.\n" 394 | "- The hyperlinks need to be formatted the following way: \n" 395 | "Example of the correctly formatted link to github: \n" 396 | "" 397 | ), 398 | role=MessageRole.SYSTEM, 399 | ) 400 | user_prompt = ChatMessage(content=("Excerpts from the course FAQ document, Slack threads, and " 401 | "GitHub repository are below delimited by the dashed lines:\n" 402 | "---------------------\n" 403 | "{context_str}\n" 404 | "---------------------\n" 405 | "Question: {query_str}\n" 406 | "Answer: "), 407 | role=MessageRole.USER, ) 408 | return ChatPromptTemplate(message_templates=[ 409 | system_prompt, 410 | user_prompt, 411 | ], 412 | function_mappings={'zoomcamp_name': lambda **kwargs: zoomcamp_name, 413 | 'cohort_year': lambda **kwargs: cohort_year, 414 | 'current_date': lambda **kwargs: datetime.datetime.now().strftime("%d %B %Y"), 415 | 'course_start_date': lambda **kwargs: course_start_date}) 416 | 417 | 418 | def get_retriever_query_engine(collection_name: str, 419 | zoomcamp_name: str, 420 | cohort_year: int, 421 | course_start_date: str): 422 | if os.getenv('LOCAL_MILVUS', None): 423 | localhost = os.getenv('LOCALHOST', 'localhost') 424 | vector_store = MilvusVectorStore(collection_name=collection_name, 425 | dim=embedding_dimension, 426 | overwrite=False, 427 | uri=f'http://{localhost}:19530') 428 | else: 429 | if collection_name in [MLOPS_COLLECTION_NAME, LLM_COLLECTION_NAME]: 430 | vector_store = MilvusVectorStore(collection_name=collection_name, 431 | uri=os.getenv("ZILLIZ_PUBLIC_ENDPOINT"), 432 | token=os.getenv("ZILLIZ_API_KEY"), 433 | dim=embedding_dimension, 434 | overwrite=False) 435 | else: 436 | vector_store = MilvusVectorStore(collection_name=collection_name, 437 | uri=os.getenv("ZILLIZ_CLOUD_URI"), 438 | token=os.getenv("ZILLIZ_CLOUD_API_KEY"), 439 | dim=embedding_dimension, 440 | overwrite=False) 441 | vector_store_index = VectorStoreIndex.from_vector_store(vector_store, 442 | embed_model=embeddings) 443 | # cohere_rerank = CohereRerank(api_key=os.getenv('COHERE_API_KEY'), top_n=4) 444 | recency_postprocessor = get_time_weighted_postprocessor() 445 | # node_postprocessors = [recency_postprocessor, cohere_rerank] 446 | node_postprocessors = [recency_postprocessor] 447 | qa_prompt_template = get_prompt_template(zoomcamp_name=zoomcamp_name, 448 | cohort_year=cohort_year, 449 | course_start_date=course_start_date) 450 | Settings.llm = ChatOpenAI(model=GPT_MODEL_NAME, 451 | temperature=0.7) 452 | 453 | response_synthesizer = get_response_synthesizer(text_qa_template=qa_prompt_template, 454 | verbose=True, 455 | ) 456 | return RetrieverQueryEngine(vector_store_index.as_retriever(similarity_top_k=15), 457 | node_postprocessors=node_postprocessors, 458 | response_synthesizer=response_synthesizer, 459 | ) 460 | 461 | 462 | def get_time_weighted_postprocessor(): 463 | return TimeWeightedPostprocessor( 464 | last_accessed_key='thread_ts', 465 | time_decay=0.4, 466 | time_access_refresh=False, 467 | top_k=10, 468 | ) 469 | 470 | 471 | if __name__ == "__main__": 472 | client = WebClient(SLACK_BOT_TOKEN) 473 | 474 | logger.info('Downloading embeddings...') 475 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 476 | while True: 477 | try: 478 | embeddings = HuggingFaceEmbedding(model_name='BAAI/bge-base-en-v1.5') 479 | embedding_dimension = len(embeddings.get_text_embedding("test")) 480 | except ChunkedEncodingError as e: 481 | continue 482 | break 483 | 484 | log_to_langsmith() 485 | 486 | 
ml_query_engine = get_retriever_query_engine(collection_name=ML_COLLECTION_NAME, 487 | zoomcamp_name='Machine Learning', 488 | cohort_year=2024, 489 | course_start_date='16 September 2024') 490 | 491 | de_query_engine = get_retriever_query_engine(collection_name=DE_COLLECTION_NAME, 492 | zoomcamp_name='Data Engineering', 493 | cohort_year=2025, 494 | course_start_date='13 January 2025') 495 | 496 | mlops_query_engine = get_retriever_query_engine(collection_name=MLOPS_COLLECTION_NAME, 497 | zoomcamp_name='MLOps', 498 | cohort_year=2024, 499 | course_start_date='13 May 2024') 500 | 501 | llm_query_engine = get_retriever_query_engine(collection_name=LLM_COLLECTION_NAME, 502 | zoomcamp_name='LLM', 503 | cohort_year=2024, 504 | course_start_date='17 June 2024') 505 | SocketModeHandler(app, SLACK_APP_TOKEN).start() 506 | -------------------------------------------------------------------------------- /slack_bot/requirements.txt: -------------------------------------------------------------------------------- 1 | openai==1.61.0 2 | slack-bolt==1.22.0 3 | slack-sdk==3.34.0 4 | langchain==0.3.17 5 | langchain-community==0.3.16 6 | sentence-transformers==3.4.1 7 | cohere==5.15.0 8 | pymilvus==2.5.4 9 | langchain-openai==0.3.3 10 | llama-index-core==0.12.15 11 | llama-index-vector-stores-milvus==0.5.0 12 | llama-index-embeddings-huggingface==0.5.1 13 | llama-index-postprocessor-cohere-rerank==0.4.0 14 | llama-index-llms-langchain==0.5.1 15 | -------------------------------------------------------------------------------- /slack_bot_custom_ingestion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaalexlit/faq-slack-bot/78962d17dbd68438bd443b7dfdb961ac9d13e574/slack_bot_custom_ingestion.png -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import langchain 4 | import pinecone 5 | from langchain.chains import RetrievalQA 6 | from langchain.embeddings import HuggingFaceEmbeddings 7 | from langchain.vectorstores import Pinecone 8 | from langchain.chat_models import ChatOpenAI 9 | 10 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 11 | embeddings = HuggingFaceEmbeddings() 12 | langchain.debug = True 13 | 14 | 15 | def main(question): 16 | pinecone.init( 17 | api_key=os.getenv('PINECONE_API_KEY'), 18 | environment=os.getenv('PINECONE_ENV') 19 | ) 20 | 21 | pinecone_index = Pinecone.from_existing_index(index_name='mlops-faq-bot', 22 | embedding=embeddings) 23 | qa = RetrievalQA.from_chain_type( 24 | llm=ChatOpenAI(model_name='gpt-3.5-turbo-1106'), 25 | retriever=pinecone_index.as_retriever() 26 | ) 27 | qa.return_source_documents = True 28 | print(f"Question: {question}") 29 | 30 | result = qa.apply([question]) 31 | for res in result: 32 | print(res.keys()) 33 | print(f"Question: {res['query']}") 34 | print(f"Answer: {res['result']}") 35 | for doc in res['source_documents']: 36 | print("----------------------------------------------------") 37 | print(f"Metadata: {doc.metadata}") 38 | print(f"Content: {doc.page_content}") 39 | 40 | 41 | if __name__ == "__main__": 42 | # main("How can I solve connection in use problem with mlflow?") 43 | main("MLflow UI throws an error on the browser 'Access to localhost was denied'. Any idea how to resolve this?") 44 | --------------------------------------------------------------------------------