├── .github
│   └── workflows
│       ├── docker-image.yml
│       ├── fly-deploy.yml
│       └── fly-rollback.yml
├── .gitignore
├── .prefectignore
├── .run
│   ├── create_secrets_blocks.run.xml
│   ├── ingest_de.run.xml
│   ├── ingest_ml.run.xml
│   ├── ingest_mlops.run.xml
│   ├── run_bot_datatalks.run.xml
│   └── run_bot_local_ws.run.xml
├── Mlops_chatbot_diagram.png
├── README.md
├── dev.env
├── fly.toml
├── ingest
│   ├── README.md
│   ├── de.dockerfile
│   ├── de
│   │   └── ingest_de.py
│   ├── dev.env
│   ├── llm.dockerfile
│   ├── llm
│   │   └── ingest_llm.py
│   ├── local_development.md
│   ├── local_milvus
│   │   └── docker-compose.yml
│   ├── ml.dockerfile
│   ├── ml
│   │   └── ingest_ml.py
│   ├── mlops.dockerfile
│   ├── mlops
│   │   ├── ingest_mlops.py
│   │   └── ingest_mlops_old.py
│   ├── prefect.md
│   ├── prefect_infra
│   │   └── create_secrets_blocks.py
│   ├── readers
│   │   ├── custom_faq_gdoc_reader.py
│   │   ├── slack_reader.py
│   │   └── youtube_reader.py
│   ├── requirements.txt
│   └── utils
│       └── index_utils.py
├── prefect.yaml
├── requirements.txt
├── slack_bot
│   ├── Dockerfile
│   ├── README.md
│   ├── app_manifest.json
│   ├── bot_icon.png
│   ├── dev.env
│   ├── docker-compose-my-workspace.yml
│   ├── docker-compose.yml
│   ├── main.py
│   └── requirements.txt
├── slack_bot_custom_ingestion.png
└── test.py
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 | jobs:
7 | build-and-push-image:
8 | name: Push Docker image to Docker Hub
9 | runs-on: ubuntu-latest
10 | permissions:
11 | contents: read
12 | packages: write
13 |
14 | steps:
15 | - name: Check out the repo
16 | uses: actions/checkout@v4
17 |
18 | - name: Log in to Docker Hub
19 | uses: docker/login-action@v3
20 | with:
21 | username: ${{ secrets.DOCKER_USERNAME }}
22 | password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
23 |
24 | - name: Pull currently used Docker image
25 | run: docker pull aaalexlit/faq-slack-bot:main
26 |
27 | - name: Tag currently used Docker image as "previous" to enable easy rollback
28 | run: docker tag aaalexlit/faq-slack-bot:main aaalexlit/faq-slack-bot:previous
29 |
30 | - name: Push tagged image to Docker Hub
31 | run: docker push aaalexlit/faq-slack-bot:previous
32 |
33 | - name: Extract metadata (tags, labels) for Docker
34 | id: meta
35 | uses: docker/metadata-action@v5
36 | with:
37 | images: aaalexlit/faq-slack-bot
38 | tags: |
39 | type=sha
40 | type=ref,event=branch
41 |
42 | - name: Build and push Docker image
43 | uses: docker/build-push-action@v5
44 | with:
45 | context: ./slack_bot/
46 | file: ./slack_bot/Dockerfile
47 | push: true
48 | tags: ${{ steps.meta.outputs.tags }}
49 | labels: ${{ steps.meta.outputs.labels }}
50 |
--------------------------------------------------------------------------------
/.github/workflows/fly-deploy.yml:
--------------------------------------------------------------------------------
1 | name: Fly Deploy
2 |
3 | on:
4 | workflow_run:
5 | workflows: ["Docker Image CI"]
6 | types:
7 | - completed
8 | workflow_dispatch:
9 | inputs:
10 | docker_tag:
11 | description: 'Docker image tag to be deployed. By default `main`'
12 | required: false
13 | default: 'main'
14 |
15 | jobs:
16 | deploy:
17 | name: Deploy app
18 | runs-on: ubuntu-latest
19 | if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
20 | steps:
21 | - uses: actions/checkout@v4
22 | - uses: superfly/flyctl-actions/setup-flyctl@master
23 | - run: |
24 | if [ "${{ github.event_name }}" == "workflow_run" ]; then
25 | DOCKER_TAG="main"
26 | else
27 | DOCKER_TAG="${{ github.event.workflow_run.event.inputs.docker_tag || github.event.inputs.docker_tag }}"
28 | fi
29 | flyctl deploy --remote-only --image aaalexlit/faq-slack-bot:${DOCKER_TAG}
30 | env:
31 | FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
32 |
--------------------------------------------------------------------------------
/.github/workflows/fly-rollback.yml:
--------------------------------------------------------------------------------
1 | name: Fly Rollback
2 | on:
3 | workflow_dispatch:
4 | jobs:
5 | rollback:
6 | name: Rollback to the previously deployed image
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | - uses: superfly/flyctl-actions/setup-flyctl@master
11 | - run: flyctl deploy --remote-only --image aaalexlit/faq-slack-bot:previous
12 | env:
13 | FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### PyCharm+all template
2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
4 |
5 | # User-specific stuff
6 | .idea/**/workspace.xml
7 | .idea/**/tasks.xml
8 | .idea/**/usage.statistics.xml
9 | .idea/**/dictionaries
10 | .idea/**/shelf
11 |
12 | # AWS User-specific
13 | .idea/**/aws.xml
14 |
15 | # Generated files
16 | .idea/**/contentModel.xml
17 |
18 | # Sensitive or high-churn files
19 | .idea/**/dataSources/
20 | .idea/**/dataSources.ids
21 | .idea/**/dataSources.local.xml
22 | .idea/**/sqlDataSources.xml
23 | .idea/**/dynamic.xml
24 | .idea/**/uiDesigner.xml
25 | .idea/**/dbnavigator.xml
26 |
27 | # Gradle
28 | .idea/**/gradle.xml
29 | .idea/**/libraries
30 |
31 | # Gradle and Maven with auto-import
32 | # When using Gradle or Maven with auto-import, you should exclude module files,
33 | # since they will be recreated, and may cause churn. Uncomment if using
34 | # auto-import.
35 | # .idea/artifacts
36 | # .idea/compiler.xml
37 | # .idea/jarRepositories.xml
38 | # .idea/modules.xml
39 | # .idea/*.iml
40 | # .idea/modules
41 | # *.iml
42 | # *.ipr
43 |
44 | # CMake
45 | cmake-build-*/
46 |
47 | # Mongo Explorer plugin
48 | .idea/**/mongoSettings.xml
49 |
50 | # File-based project format
51 | *.iws
52 |
53 | # IntelliJ
54 | out/
55 |
56 | # mpeltonen/sbt-idea plugin
57 | .idea_modules/
58 |
59 | # JIRA plugin
60 | atlassian-ide-plugin.xml
61 |
62 | # Cursive Clojure plugin
63 | .idea/replstate.xml
64 |
65 | # SonarLint plugin
66 | .idea/sonarlint/
67 |
68 | # Crashlytics plugin (for Android Studio and IntelliJ)
69 | com_crashlytics_export_strings.xml
70 | crashlytics.properties
71 | crashlytics-build.properties
72 | fabric.properties
73 |
74 | # Editor-based Rest Client
75 | .idea/httpRequests
76 |
77 | # Android studio 3.1+ serialized cache file
78 | .idea/caches/build_file_checksums.ser
79 |
80 | ### Python template
81 | # Byte-compiled / optimized / DLL files
82 | __pycache__/
83 | *.py[cod]
84 | *$py.class
85 |
86 | # C extensions
87 | *.so
88 |
89 | # Distribution / packaging
90 | .Python
91 | build/
92 | develop-eggs/
93 | dist/
94 | downloads/
95 | eggs/
96 | .eggs/
97 | lib/
98 | lib64/
99 | parts/
100 | sdist/
101 | var/
102 | wheels/
103 | share/python-wheels/
104 | *.egg-info/
105 | .installed.cfg
106 | *.egg
107 | MANIFEST
108 |
109 | # PyInstaller
110 | # Usually these files are written by a python script from a template
111 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
112 | *.manifest
113 | *.spec
114 |
115 | # Installer logs
116 | pip-log.txt
117 | pip-delete-this-directory.txt
118 |
119 | # Unit test / coverage reports
120 | htmlcov/
121 | .tox/
122 | .nox/
123 | .coverage
124 | .coverage.*
125 | .cache
126 | nosetests.xml
127 | coverage.xml
128 | *.cover
129 | *.py,cover
130 | .hypothesis/
131 | .pytest_cache/
132 | cover/
133 |
134 | # Translations
135 | *.mo
136 | *.pot
137 |
138 | # Django stuff:
139 | *.log
140 | local_settings.py
141 | db.sqlite3
142 | db.sqlite3-journal
143 |
144 | # Flask stuff:
145 | instance/
146 | .webassets-cache
147 |
148 | # Scrapy stuff:
149 | .scrapy
150 |
151 | # Sphinx documentation
152 | docs/_build/
153 |
154 | # PyBuilder
155 | .pybuilder/
156 | target/
157 |
158 | # Jupyter Notebook
159 | .ipynb_checkpoints
160 |
161 | # IPython
162 | profile_default/
163 | ipython_config.py
164 |
165 | # pyenv
166 | # For a library or package, you might want to ignore these files since the code is
167 | # intended to run in multiple environments; otherwise, check them in:
168 | # .python-version
169 |
170 | # pipenv
171 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
172 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
173 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
174 | # install all needed dependencies.
175 | #Pipfile.lock
176 |
177 | # poetry
178 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
179 | # This is especially recommended for binary packages to ensure reproducibility, and is more
180 | # commonly ignored for libraries.
181 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
182 | #poetry.lock
183 |
184 | # pdm
185 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
186 | #pdm.lock
187 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
188 | # in version control.
189 | # https://pdm.fming.dev/#use-with-ide
190 | .pdm.toml
191 |
192 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
193 | __pypackages__/
194 |
195 | # Celery stuff
196 | celerybeat-schedule
197 | celerybeat.pid
198 |
199 | # SageMath parsed files
200 | *.sage.py
201 |
202 | # Environments
203 | .env
204 | .venv
205 | env/
206 | venv/
207 | ENV/
208 | env.bak/
209 | venv.bak/
210 |
211 | # Spyder project settings
212 | .spyderproject
213 | .spyproject
214 |
215 | # Rope project settings
216 | .ropeproject
217 |
218 | # mkdocs documentation
219 | /site
220 |
221 | # mypy
222 | .mypy_cache/
223 | .dmypy.json
224 | dmypy.json
225 |
226 | # Pyre type checker
227 | .pyre/
228 |
229 | # pytype static type analyzer
230 | .pytype/
231 |
232 | # Cython debug symbols
233 | cython_debug/
234 |
235 | # PyCharm
236 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
237 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
238 | # and can be added to the global gitignore or merged into this file. For a more nuclear
239 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
240 | #.idea/
241 |
242 | ingest/keys
243 | wandb/
244 | .idea
245 |
246 | # ignore local folders where the indexed repo gets cloned to
247 | **/git
248 |
249 | # ignore local milvus volumes
250 | **/volumes
251 |
252 | /ingest/ml/build_docker_image.md
253 |
254 | *.ipynb
--------------------------------------------------------------------------------
/.prefectignore:
--------------------------------------------------------------------------------
1 | # prefect artifacts
2 | .prefectignore
3 |
4 | # python artifacts
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 | *.egg-info/
9 | *.egg
10 |
11 | # Type checking artifacts
12 | .mypy_cache/
13 | .dmypy.json
14 | dmypy.json
15 | .pyre/
16 |
17 | # IPython
18 | profile_default/
19 | ipython_config.py
20 | *.ipynb_checkpoints/*
21 |
22 | # Environments
23 | .python-version
24 | .env
25 | .venv
26 | env/
27 | venv/
28 |
29 | # MacOS
30 | .DS_Store
31 |
32 | # Dask
33 | dask-worker-space/
34 |
35 | # Editors
36 | .idea/
37 | .vscode/
38 |
39 | # VCS
40 | .git/
41 | .hg/
42 |
--------------------------------------------------------------------------------
/.run/create_secrets_blocks.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.run/ingest_de.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.run/ingest_ml.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.run/ingest_mlops.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.run/run_bot_datatalks.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.run/run_bot_local_ws.run.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Mlops_chatbot_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaalexlit/faq-slack-bot/78962d17dbd68438bd443b7dfdb961ac9d13e574/Mlops_chatbot_diagram.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is a repo for an LLM-powered QA Slack chatbot for answering questions
2 | related to DataTalks.Club Zoomcamps.
3 |
4 | # Current version
5 |
6 | Please refer to [this report](https://api.wandb.ai/links/aaalex-lit/ii6tpid4), which
7 | explains in detail how the bot works.
8 |
9 | # First (outdated) version
10 | ## Behind the scenes
11 | The course FAQ Google Document and the course repo get indexed into the Pinecone vector store.
12 | Semantic search then retrieves the pieces most similar (and hopefully most relevant) to the question asked.
13 | This information is then passed as context to a conversational LLM to form the final answer.
14 |
15 | 
16 |
17 | # Before you start
18 |
19 | Use any python virtual environment manager of your preference
20 | and install the dependencies from [requirements.txt](requirements.txt)
21 |
22 | # Fill [Pinecone](https://www.pinecone.io/) index
23 | 1. Set `PINECONE_API_KEY` and `PINECONE_ENV`
24 | environment variables accordingly
25 |
26 | 2. Run [ingest/mlops/ingest_mlops_old.py](ingest/mlops/ingest_mlops_old.py)
27 |
28 | ```bash
29 | python ingest_mlops_old.py
30 | ```
31 | # Test QA pipeline locally
32 | 1. Set `OPENAI_API_KEY`, `PINECONE_API_KEY`, and `PINECONE_ENV`
33 | environment variables accordingly
34 | 1. Run [test.py](test.py)
35 |
36 | ```bash
37 | python test.py
38 | ```
39 | # Launch the bot
40 | 1. Set `OPENAI_API_KEY`, `SLACK_APP_TOKEN`, `SLACK_BOT_TOKEN`,
41 | `PINECONE_API_KEY`, and `PINECONE_ENV`
42 | environment variables accordingly
43 | 1. To launch the bot from the CLI, run [slack_bot/main.py](slack_bot/main.py)
44 | ```bash
45 | python main.py
46 | ```
47 |
48 | Alternatively, it can be launched with Docker; please follow
49 | [this README](slack_bot/README.md)
--------------------------------------------------------------------------------
/dev.env:
--------------------------------------------------------------------------------
1 | # test workspace slack token
2 | SLACK_APP_TOKEN=xapp-..
3 | SLACK_BOT_TOKEN=xoxb-..
4 |
5 | # OpenAI API key
6 | OPENAI_API_KEY=sk-..
7 |
8 | PINECONE_API_KEY=..
9 | PINECONE_ENV=..
10 |
11 | WANDB_API_KEY=..
12 |
13 | ZILLIZ_CLOUD_URI=https://..
14 | ZILLIZ_CLOUD_API_KEY=..
15 |
16 | ZILLIZ_PUBLIC_ENDPOINT=https://..
17 | ZILLIZ_API_KEY=..
18 |
19 | LANGCHAIN_API_KEY=lsv2_..
20 |
21 | COHERE_API_KEY=..
22 |
23 | # DEBUG log level
24 | #LOG_LEVEL=10
25 |
26 | LOCAL_MILVUS=True
--------------------------------------------------------------------------------
/fly.toml:
--------------------------------------------------------------------------------
1 | app = "faq-slack-bot"
2 | primary_region = "mad"
3 |
4 | [build]
5 | image = "aaalexlit/faq-slack-bot:main"
6 |
7 | [env]
8 | PINECONE_ENV = "gcp-starter"
9 |
--------------------------------------------------------------------------------
/ingest/README.md:
--------------------------------------------------------------------------------
1 | # Execute indexing
2 | ## For ML Zoomcamp
3 | At the moment, indexing is scheduled to run every 24 hours at 23:00 CET
4 | via [Prefect Cloud](https://app.prefect.cloud/) deployments
5 |
6 | Steps to change/run the deployment are described in [prefect.md](prefect.md)
7 |
8 | ## For MLOps Zoomcamp
9 |
10 | Execute [ingest_mlops_old.py](mlops/ingest_mlops_old.py)
11 | ```shell
12 | python ingest_mlops_old.py
13 | ```
14 |
15 | # Set up Prefect
16 |
17 | To run any ingestion, Prefect needs to be set up,
18 | as the code relies on secrets stored in Prefect blocks.
19 |
20 | ## Create a new profile to use with Prefect Cloud and switch to it (optional)
21 |
22 | ```bash
23 | prefect profile create cloud
24 | prefect profile use cloud
25 | ```
26 |
27 | ## Log in to Prefect Cloud either through the browser or using the API key
28 | ```bash
29 | prefect cloud login
30 | ```
31 |
32 | Create the required Prefect blocks. Make sure to set the corresponding environment
33 | variables first.
34 |
35 | ```shell
36 | python ingest/prefect_infra/create_secrets_blocks.py
37 | ```
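
For reference, this is how the ingestion code reads those secrets back at run time
(block names come from [create_secrets_blocks.py](prefect_infra/create_secrets_blocks.py));
a minimal sketch:

```python
from prefect.blocks.system import Secret
from prefect_gcp import GcpCredentials

# Secret blocks created by create_secrets_blocks.py are loaded by name
zilliz_uri = Secret.load('zilliz-cloud-uri').get()
zilliz_api_key = Secret.load('zilliz-cloud-api-key').get()
slack_bot_token = Secret.load('slack-bot-token').get()

# The Google Drive service account credentials live in a GcpCredentials block
creds = GcpCredentials.load('google-drive-creds')
service_account_info = creds.service_account_info.get_secret_value()
```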
--------------------------------------------------------------------------------
/ingest/de.dockerfile:
--------------------------------------------------------------------------------
1 | FROM prefecthq/prefect:2-python3.10
2 |
3 | RUN apt-get update && \
4 | apt-get install -y gcc python3-dev
5 |
6 | RUN pip install -U pip
7 |
8 | WORKDIR /usr/src
9 |
10 | COPY ingest/requirements.txt ./
11 | RUN pip install --no-cache-dir -r requirements.txt
12 |
13 | ENV EMBEDDING_CACHE_NAMESPACE=de_zoomcamp
14 |
15 | COPY ingest/de/ingest_de.py ingest/de/
16 | COPY ingest/readers ingest/readers
17 | COPY ingest/utils ingest/utils
18 |
--------------------------------------------------------------------------------
/ingest/de/ingest_de.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from prefect import flow, task
4 |
5 | from ingest.utils.index_utils import index_spreadsheet, index_github_repo, \
6 | index_slack_history, index_faq, index_youtube
7 |
8 | DE_CHANNEL_ID = 'C01FABYF2RG'
9 | FAQ_COLLECTION_NAME = 'dezoomcamp_faq_git'
10 |
11 | os.environ['PREFECT_LOGGING_EXTRA_LOGGERS'] = 'llama-index-core'
12 |
13 |
14 | @task(name="Index course github repo")
15 | def index_course_github_repo():
16 | owner = 'DataTalksClub'
17 | repo = 'data-engineering-zoomcamp'
18 | branch = 'main'
19 | index_github_repo(owner=owner,
20 | repo=repo,
21 | branch=branch,
22 | collection_name=FAQ_COLLECTION_NAME,
23 | ignore_directories=['.github', '.gitignore', 'cohorts/2022', 'cohorts/2023', 'cohorts/2024',
24 | 'images'],
25 | )
26 |
27 |
28 | @task(name="Index risingwave zoomcamp github repo")
29 | def index_risingwave_zoomcamp_github_repo():
30 | owner = 'risingwavelabs'
31 | repo = 'risingwave-data-talks-workshop-2024-03-04'
32 | branch = 'main'
33 | index_github_repo(owner=owner,
34 | repo=repo,
35 | branch=branch,
36 | collection_name=FAQ_COLLECTION_NAME,
37 | ignore_directories=['assets', 'data'],
38 | ignore_file_extensions=['.gitignore', '.parquet', '.csv'])
39 |
40 |
41 | @task(name="Index mage zoomcamp github repo")
42 | def index_mage_zoomcamp_github_repo():
43 | owner = 'mage-ai'
44 | repo = 'mage-zoomcamp'
45 | branch = 'solutions'
46 | index_github_repo(owner=owner,
47 | repo=repo,
48 | branch=branch,
49 | collection_name=FAQ_COLLECTION_NAME,
50 | ignore_directories=[],
51 | ignore_file_extensions=['.gitignore'])
52 |
53 |
54 | @task(name="Index FAQ Google Document")
55 | def index_google_doc():
56 | document_ids = ["19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw"]
57 | print('Loading google doc...')
58 | index_faq(document_ids, FAQ_COLLECTION_NAME)
59 |
60 |
61 | @task(name="Index course schedule")
62 | def index_course_schedule():
63 | url = (
64 | 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-'
65 | 'yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml')
66 | title = 'DE Zoomcamp 2024 syllabus and deadlines'
67 | index_spreadsheet(url, title, FAQ_COLLECTION_NAME)
68 |
69 |
70 | @task(name="Index slack messages")
71 | def index_slack_messages():
72 | channel_ids = [DE_CHANNEL_ID]
73 | index_slack_history(channel_ids, FAQ_COLLECTION_NAME)
74 |
75 |
76 | @task(name="Index QA videos subtitles")
77 | def index_yt_subtitles():
78 | video_ids = ['X8cEEwi8DTM']
79 | index_youtube(video_ids, FAQ_COLLECTION_NAME)
80 |
81 |
82 | @flow(name="Update DE info Milvus index", log_prints=True)
83 | def fill_de_index():
84 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}")
85 | index_google_doc()
86 | index_slack_messages.submit(wait_for=[index_google_doc])
87 | index_course_schedule.submit(wait_for=[index_google_doc])
88 | # index_evaluation_criteria.submit(wait_for=[index_google_doc])
89 | index_course_github_repo.submit(wait_for=[index_google_doc])
90 | index_yt_subtitles.submit(wait_for=[index_google_doc])
91 |
92 |
93 | if __name__ == '__main__':
94 | fill_de_index()
95 |
--------------------------------------------------------------------------------
/ingest/dev.env:
--------------------------------------------------------------------------------
1 | PINECONE_API_KEY=..
2 | PINECONE_ENV=..
3 | ZILLIZ_CLOUD_URI=https://..
4 | ZILLIZ_CLOUD_API_KEY=..
5 | SLACK_BOT_TOKEN=xoxb-..
6 | GITHUB_TOKEN=ghp_..
7 | UPSTASH_REDIS_REST_URL=https://..
8 | UPSTASH_REDIS_REST_TOKEN=..
9 | ZILLIZ_PUBLIC_ENDPOINT=https://..
10 | ZILLIZ_API_KEY=..
11 |
--------------------------------------------------------------------------------
/ingest/llm.dockerfile:
--------------------------------------------------------------------------------
1 | FROM prefecthq/prefect:2-python3.10
2 |
3 | RUN apt-get update && \
4 | apt-get install -y gcc python3-dev
5 |
6 | RUN pip install -U pip
7 |
8 | WORKDIR /usr/src
9 |
10 | COPY ingest/requirements.txt ./
11 | RUN pip install --no-cache-dir -r requirements.txt
12 |
13 | ENV EMBEDDING_CACHE_NAMESPACE=llm_zoomcamp
14 |
15 | COPY ingest/llm/ingest_llm.py ingest/llm/
16 | COPY ingest/readers ingest/readers
17 | COPY ingest/utils ingest/utils
18 |
--------------------------------------------------------------------------------
/ingest/llm/ingest_llm.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from prefect import flow, task
4 |
5 | from ingest.utils.index_utils import index_github_repo, \
6 | index_slack_history, index_faq
7 |
8 | SLACK_CHANNEL_ID = 'C06TEGTGM3J'
9 | COLLECTION_NAME = 'llmzoomcamp'
10 |
11 |
12 | @task(name="Index course github repo")
13 | def index_course_github_repo():
14 | owner = 'DataTalksClub'
15 | repo = 'llm-zoomcamp'
16 | branch = 'main'
17 | index_github_repo(owner=owner,
18 | repo=repo,
19 | branch=branch,
20 | collection_name=COLLECTION_NAME,
21 | ignore_directories=['.github', '.gitignore', 'images'],
22 | )
23 |
24 |
25 | @task(name="Index FAQ Google Document")
26 | def index_google_doc():
27 | document_ids = ["1m2KexowAXTmexfC5rVTCSnaShvdUQ8Ag2IEiwBDHxN0"]
28 | print('Loading google doc...')
29 | index_faq(document_ids, COLLECTION_NAME)
30 |
31 |
32 | @task(name="Index slack messages")
33 | def index_slack_messages():
34 | channel_ids = [SLACK_CHANNEL_ID]
35 | index_slack_history(channel_ids, COLLECTION_NAME)
36 |
37 |
38 | @flow(name="Update LLM info Milvus index", log_prints=True)
39 | def fill_llm_index():
40 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}")
41 | index_google_doc()
42 | index_slack_messages.submit(wait_for=[index_google_doc])
43 | index_course_github_repo.submit(wait_for=[index_google_doc])
44 |
45 |
46 | if __name__ == '__main__':
47 | fill_llm_index()
48 |
--------------------------------------------------------------------------------
/ingest/local_development.md:
--------------------------------------------------------------------------------
1 | # Run ingestion locally for ML and DE Zoomcamps
2 |
3 | Steps to fill in the index locally:
4 |
5 | 1. Start dockerized [Milvus](https://milvus.io/) from the [local_milvus](local_milvus) folder
6 | ```shell
7 | cd ingest/local_milvus
8 | docker compose up
9 | ```
10 |
11 | 1. Rename [dev.env](../dev.env) to `.env` and set all the required variables
12 |
13 | 1. Create the Prefect blocks (only needs to be run once)
14 | ```shell
15 | python ingest/prefect_infra/create_secrets_blocks.py
16 | ```
17 |
18 | 1. Execute the ingestion script [ingest_ml.py](ml/ingest_ml.py) (for ML Zoomcamp data)
19 | or [ingest_de.py](de/ingest_de.py) (for DE Zoomcamp data).
20 | It runs with the `EXECUTION_ENV` env var set to `local` by default
21 | ```shell
22 | export PYTHONPATH="${PYTHONPATH}:$(pwd)"
23 | python ingest/ml/ingest_ml.py
24 | ```
25 |
26 | If you're using the PyCharm IDE, there are run configurations available:
27 | [ingest_de](../.run/ingest_de.run.xml)
28 | [ingest_ml](../.run/ingest_ml.run.xml)
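
Optionally, verify that the local index got filled, for example with a quick
`pymilvus` check (a sketch assuming `pymilvus` is available in your environment;
the port comes from [local_milvus/docker-compose.yml](local_milvus/docker-compose.yml)
and the collection name from [ingest_ml.py](ml/ingest_ml.py)):

```python
from pymilvus import Collection, connections, utility

# Connect to the standalone Milvus started with docker compose
connections.connect(host='localhost', port='19530')

# List the collections created by the ingestion scripts
print(utility.list_collections())

# Entity count for the ML Zoomcamp collection
print(Collection('mlzoomcamp_faq_git').num_entities)
```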
--------------------------------------------------------------------------------
/ingest/local_milvus/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.5'
2 |
3 | services:
4 | etcd:
5 | container_name: milvus-etcd
6 | image: quay.io/coreos/etcd:v3.5.16
7 | environment:
8 | - ETCD_AUTO_COMPACTION_MODE=revision
9 | - ETCD_AUTO_COMPACTION_RETENTION=1000
10 | - ETCD_QUOTA_BACKEND_BYTES=4294967296
11 | - ETCD_SNAPSHOT_COUNT=50000
12 | volumes:
13 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
14 | command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
15 | healthcheck:
16 | test: ["CMD", "etcdctl", "endpoint", "health"]
17 | interval: 30s
18 | timeout: 20s
19 | retries: 3
20 |
21 | minio:
22 | container_name: milvus-minio
23 | image: minio/minio:RELEASE.2023-03-20T20-16-18Z
24 | environment:
25 | MINIO_ACCESS_KEY: minioadmin
26 | MINIO_SECRET_KEY: minioadmin
27 | ports:
28 | - "9001:9001"
29 | - "9000:9000"
30 | volumes:
31 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
32 | command: minio server /minio_data --console-address ":9001"
33 | healthcheck:
34 | test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
35 | interval: 30s
36 | timeout: 20s
37 | retries: 3
38 |
39 | standalone:
40 | container_name: milvus-standalone
41 | image: milvusdb/milvus:v2.5.4
42 | command: ["milvus", "run", "standalone"]
43 | security_opt:
44 | - seccomp:unconfined
45 | environment:
46 | ETCD_ENDPOINTS: etcd:2379
47 | MINIO_ADDRESS: minio:9000
48 | volumes:
49 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
50 | healthcheck:
51 | test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
52 | interval: 30s
53 | start_period: 90s
54 | timeout: 20s
55 | retries: 3
56 | ports:
57 | - "19530:19530"
58 | - "9091:9091"
59 | depends_on:
60 | - "etcd"
61 | - "minio"
62 |
63 | networks:
64 | default:
65 | name: milvus
--------------------------------------------------------------------------------
/ingest/ml.dockerfile:
--------------------------------------------------------------------------------
1 | FROM prefecthq/prefect:2-python3.10
2 |
3 | RUN apt-get update && \
4 | apt-get install -y gcc python3-dev
5 |
6 | RUN pip install -U pip
7 |
8 | WORKDIR /usr/src
9 |
10 | COPY ingest/requirements.txt ./
11 | RUN pip install --no-cache-dir -r requirements.txt
12 |
13 | ENV EMBEDDING_CACHE_NAMESPACE=ml_zoomcamp
14 |
15 | COPY ingest/ml/ingest_ml.py ingest/ml/
16 | COPY ingest/readers ingest/readers
17 | COPY ingest/utils ingest/utils
18 |
--------------------------------------------------------------------------------
/ingest/ml/ingest_ml.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from prefect import flow, task
4 |
5 | from ingest.utils.index_utils import index_spreadsheet, index_github_repo, \
6 | index_slack_history, index_faq
7 |
8 | ML_CHANNEL_ID = 'C0288NJ5XSA'
9 | FAQ_COLLECTION_NAME = 'mlzoomcamp_faq_git'
10 |
11 |
12 | @task(name="Index course github repo")
13 | def index_course_github_repo():
14 | owner = 'DataTalksClub'
15 | repo = 'machine-learning-zoomcamp'
16 | branch = 'master'
17 | index_github_repo(owner=owner,
18 | repo=repo,
19 | branch=branch,
20 | collection_name=FAQ_COLLECTION_NAME)
21 |
22 |
23 | @task(name="Index book github repo")
24 | def index_book_github_repo():
25 | owner = 'alexeygrigorev'
26 | repo = 'mlbookcamp-code'
27 | branch = 'master'
28 | ignore_directories = ['.github', 'course-zoomcamp', 'images', 'util']
29 | index_github_repo(owner=owner,
30 | repo=repo,
31 | branch=branch,
32 | ignore_directories=ignore_directories,
33 | collection_name=FAQ_COLLECTION_NAME)
34 |
35 |
36 | @task(name="Index FAQ Google Document")
37 | def index_google_doc():
38 | document_ids = ["1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8"]
39 | print('Loading google doc...')
40 | index_faq(document_ids, FAQ_COLLECTION_NAME)
41 |
42 |
43 | @task(name="Index course schedule")
44 | def index_course_schedule():
45 | url = ('https://docs.google.com/spreadsheets/d/e/2PACX'
46 | '-1vSkEwMv5OKwCdPfW6LgqQvKk48dZjPcFDrjDstBqZfq38UPadh0Nws1b57qOVYwzAjSufKnVf7umGWH/pubhtml')
47 | title = 'ML Zoomcamp 2023 syllabus and deadlines'
48 | index_spreadsheet(url, title, FAQ_COLLECTION_NAME)
49 |
50 |
51 | @task(name="Index project evaluation criteria")
52 | def index_evaluation_criteria():
53 | url = ('https://docs.google.com/spreadsheets/d/e/2PACX'
54 | '-1vQCwqAtkjl07MTW-SxWUK9GUvMQ3Pv_fF8UadcuIYLgHa0PlNu9BRWtfLgivI8xSCncQs82HDwGXSm3/pubhtml')
55 | title = 'ML Zoomcamp project evaluation criteria : Project criteria'
56 | index_spreadsheet(url, title, FAQ_COLLECTION_NAME)
57 |
58 |
59 | @task(name="Index slack messages")
60 | def index_slack_messages():
61 | channel_ids = [ML_CHANNEL_ID]
62 | index_slack_history(channel_ids, FAQ_COLLECTION_NAME)
63 |
64 |
65 | @flow(name="Update ML info Milvus index", log_prints=True)
66 | def fill_ml_index():
67 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}")
68 | index_google_doc()
69 | index_slack_messages.submit(wait_for=[index_google_doc])
70 | index_course_schedule.submit(wait_for=[index_google_doc])
71 | index_evaluation_criteria.submit(wait_for=[index_google_doc])
72 | index_course_github_repo.submit(wait_for=[index_google_doc])
73 | index_book_github_repo.submit(wait_for=[index_google_doc])
74 |
75 |
76 | if __name__ == '__main__':
77 | fill_ml_index()
78 |
--------------------------------------------------------------------------------
/ingest/mlops.dockerfile:
--------------------------------------------------------------------------------
1 | FROM prefecthq/prefect:2-python3.10
2 |
3 | RUN apt-get update && \
4 | apt-get install -y gcc python3-dev
5 |
6 | RUN pip install -U pip
7 |
8 | WORKDIR /usr/src
9 |
10 | COPY ingest/requirements.txt ./
11 | RUN pip install --no-cache-dir -r requirements.txt
12 |
13 | ENV EMBEDDING_CACHE_NAMESPACE=mlops_zoomcamp
14 |
15 | COPY ingest/mlops/ingest_mlops.py ingest/mlops/
16 | COPY ingest/readers ingest/readers
17 | COPY ingest/utils ingest/utils
18 |
--------------------------------------------------------------------------------
/ingest/mlops/ingest_mlops.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from prefect import flow, task
4 |
5 | from ingest.utils.index_utils import index_github_repo, \
6 | index_slack_history, index_faq
7 |
8 | SLACK_CHANNEL_ID = 'C02R98X7DS9'
9 | COLLECTION_NAME = 'mlopszoomcamp'
10 |
11 |
12 | @task(name="Index course github repo")
13 | def index_course_github_repo():
14 | owner = 'DataTalksClub'
15 | repo = 'mlops-zoomcamp'
16 | branch = 'main'
17 | index_github_repo(owner=owner,
18 | repo=repo,
19 | branch=branch,
20 | collection_name=COLLECTION_NAME,
21 | ignore_directories=['.github', '.gitignore', 'cohorts/2022', 'cohorts/2023', 'images'],
22 | )
23 |
24 |
25 | @task(name="Index FAQ Google Document")
26 | def index_google_doc():
27 | document_ids = ["12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0"]
28 | print('Loading google doc...')
29 | index_faq(document_ids, COLLECTION_NAME)
30 |
31 |
32 | @task(name="Index slack messages")
33 | def index_slack_messages():
34 | channel_ids = [SLACK_CHANNEL_ID]
35 | index_slack_history(channel_ids, COLLECTION_NAME)
36 |
37 |
38 | @flow(name="Update MLOps info Milvus index", log_prints=True)
39 | def fill_mlops_index():
40 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}")
41 | index_google_doc()
42 | index_slack_messages.submit(wait_for=[index_google_doc])
43 | index_course_github_repo.submit(wait_for=[index_google_doc])
44 |
45 |
46 | if __name__ == '__main__':
47 | fill_mlops_index()
48 |
--------------------------------------------------------------------------------
/ingest/mlops/ingest_mlops_old.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import shutil
4 | import tempfile
5 | import time
6 | from pathlib import Path
7 |
8 | import pinecone # type: ignore
9 | from langchain_community.document_loaders import GoogleDriveLoader, GitLoader
10 | from langchain.embeddings import HuggingFaceEmbeddings
11 | from langchain.text_splitter import RecursiveCharacterTextSplitter
12 | from langchain.vectorstores import Pinecone
13 | from prefect import flow, task
14 | from prefect.blocks.system import Secret
15 | from prefect_gcp import GcpCredentials
16 |
17 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
18 | embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5')
19 | embedding_dimension = len(embeddings.embed_query("test"))
20 | print(f'embedding dimension = {embedding_dimension}')
21 |
22 |
23 | @task(name="Index FAQ Google Document")
24 | def ingest_google_doc(index_name: str,
25 | document_ids: list[str],
26 | ):
27 | print('Loading google doc...')
28 | temp_creds = tempfile.NamedTemporaryFile()
29 | creds_dict = GcpCredentials.load("google-drive-creds").service_account_info.get_secret_value()
30 | with open(temp_creds.name, 'w') as f_out:
31 | json.dump(creds_dict, f_out)
32 | loader = GoogleDriveLoader(service_account_key=Path(temp_creds.name),
33 | document_ids=document_ids)
34 | # loader = GoogleDriveLoader(service_account_key=Path.cwd() / "keys" / "service_account_key.json",
35 | # document_ids=document_ids)
36 |
37 | raw_docs = loader.load()
38 | temp_creds.close()
39 | print('Splitting docs for indexing...')
40 | text_splitter = get_text_splitter()
41 | docs = text_splitter.split_documents(raw_docs)
42 |
43 | index_docs(docs, index_name)
44 |
45 |
46 | def index_docs(docs, index_name):
47 | print('Filling the index up...')
48 | Pinecone.from_documents(docs, embeddings, index_name=index_name)
49 | time.sleep(10)
50 | print_index_status(index_name)
51 |
52 |
53 | @task(name="Delete and Create Pinecone index")
54 | def create_pinecone_index(index_name: str):
55 | if index_name in pinecone.list_indexes():
56 | print(f"Index {index_name} exists. Deleting...")
57 | pinecone.delete_index(index_name)
58 |
59 | if index_name not in pinecone.list_indexes():
60 | print(f"Creating index {index_name}...")
61 | pinecone.create_index(
62 | name=index_name,
63 | dimension=embedding_dimension
64 | )
65 |
66 | print_index_status(index_name)
67 |
68 |
69 | def print_index_status(index_name):
70 | index = pinecone.GRPCIndex(index_name)
71 | index_stats = index.describe_index_stats()
72 | print(f"index stats: {index_stats}")
73 |
74 |
75 | @task(name="Index git repo")
76 | def ingest_git_repo(repo_url: str, index_name: str):
77 | local_dir_path = f"./git/{repo_url[repo_url.rindex('/') + 1:]}"
78 | if Path(local_dir_path).exists():
79 | remove_local_dir(local_dir_path)
80 | loader = GitLoader(
81 | clone_url=repo_url,
82 | repo_path=local_dir_path,
83 | )
84 | print('Loading and Splitting git repo for indexing...')
85 | text_splitter = get_text_splitter()
86 | docs = loader.load_and_split(text_splitter)
87 | index_docs(docs, index_name)
88 | remove_local_dir(local_dir_path)
89 |
90 |
91 | def remove_local_dir(local_dir_path):
92 | print(f'Removing local files in {local_dir_path}')
93 | shutil.rmtree(local_dir_path)
94 |
95 |
96 | def get_text_splitter():
97 | return RecursiveCharacterTextSplitter(
98 | chunk_size=1000,
99 | chunk_overlap=200,
100 | )
101 |
102 |
103 | @flow(name="Update the index in Pinecone for MLOps Zoomcamp", log_prints=True)
104 | def create_and_fill_the_index(index_name: str,
105 | google_doc_ids: list[str],
106 | repo_url: str,
107 | overwrite: bool):
108 | pinecone.init(
109 | api_key=Secret.load('pinecone-api-key').get(),
110 | environment=Secret.load('pinecone-env').get()
111 | )
112 | if overwrite:
113 | create_pinecone_index(index_name=index_name)
114 | ingest_google_doc(index_name,
115 | google_doc_ids)
116 | ingest_git_repo(repo_url, index_name)
117 |
118 |
119 | if __name__ == "__main__":
120 | index_name = 'mlops-faq-bot'
121 | google_doc_id = ["12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0"]
122 | repo_url = 'https://github.com/DataTalksClub/mlops-zoomcamp'
123 | overwrite = True
124 | create_and_fill_the_index(index_name=index_name,
125 | google_doc_ids=google_doc_id,
126 | repo_url=repo_url,
127 | overwrite=overwrite)
128 |
--------------------------------------------------------------------------------
/ingest/prefect.md:
--------------------------------------------------------------------------------
1 | # Run the ingestion for ML and DE with Prefect deployments
2 |
3 | ## Execute ingestion
4 |
5 | Currently, indexing is scheduled to execute:
6 | - Daily at 00:00 CET for **DE Zoomcamp** documents
7 | - Weekly at 23:00 CET on Monday for **ML Zoomcamp** documents
8 |
9 | Before running any execution, make sure the worker is started:
10 | ```shell
11 | prefect worker start --pool zoomcamp-faq-bot
12 | ```
13 |
14 | Ad-hoc executions can be run from the [Prefect Cloud UI](https://app.prefect.cloud/)
15 | by launching the corresponding deployment.
16 |
17 |
18 | It's also possible to run it from the command line:
19 |
20 | ### Run ingestion deployment for ML
21 | ```shell
22 | prefect deployment run 'Update ML info Milvus index/fill-index-zilliz-ml'
23 | ```
24 |
25 | ### Run ingestion deployment for DE
26 | ```shell
27 | prefect deployment run 'Update DE info Milvus index/fill-index-zilliz-de'
28 | ```
29 |
30 | ## Change the properties of a deployment
31 | ### Bulk
32 | Depending on the nature of the changes, after modifying the code or
33 | [prefect.yaml](../prefect.yaml), re-create both deployments by running
34 |
35 | ```shell
36 | prefect deploy --all
37 | ```
38 | ### Individual
39 | Alternatively, it can be done per deployment if the changes don't affect both:
40 | **re-create deployment for ML ingestion**
41 | ```shell
42 | prefect deploy --name fill-index-zilliz-ml
43 | ```
44 | **re-create deployment for DE ingestion**
45 | ```shell
46 | prefect deploy --name fill-index-zilliz-de
47 | ```
48 |
49 | ## Set up Prefect from scratch
50 |
51 | Log in to Prefect Cloud:
52 |
53 | ```shell
54 | prefect cloud login
55 | ```
56 |
57 | Create the required blocks:
58 |
59 | ```shell
60 | python ingest/prefect_infra/create_secrets_blocks.py
61 | ```
62 |
63 | Create a work pool:
64 |
65 | ```shell
66 | prefect work-pool create --type docker zoomcamp-faq-bot
67 | ```
68 |
69 | Run the following command in a new terminal to start the worker:
70 |
71 | ```shell
72 | prefect worker start --pool zoomcamp-faq-bot
73 | ```
74 |
75 | Create all the deployments from the [prefect.yaml](../prefect.yaml) file:
76 |
77 | ```shell
78 | prefect deploy --all
79 | ```
80 |
81 | Run the ingestion by executing created deployments following the
82 | instructions above.
--------------------------------------------------------------------------------
/ingest/prefect_infra/create_secrets_blocks.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import time
4 |
5 | from prefect.blocks.system import Secret
6 | from prefect_gcp import GcpCredentials
7 |
8 |
9 | def create_gcp_creds_block():
10 | block_name = "google-drive-creds"
11 | try:
12 | GcpCredentials.load(block_name)
13 | print(f"Block {block_name} exists")
14 | except ValueError:
15 | print(f"Creating Block {block_name}")
16 | with open("../keys/service_account_key.json", 'r') as f_in:
17 | service_account_info_str = f_in.read()
18 |
19 | service_account_info = json.loads(service_account_info_str)
20 |
21 | GcpCredentials(
22 | service_account_info=service_account_info
23 | ).save(block_name)
24 | time.sleep(10)
25 |
26 |
27 | def create_secret_block(block_name: str, env_var_name: str) -> None:
28 | try:
29 | Secret.load(block_name)
30 | print(f"Block {block_name} exists")
31 | except ValueError:
32 | print(f"Creating Block {block_name}")
33 | Secret(value=os.getenv(env_var_name)).save(name=block_name)
34 | time.sleep(10)
35 |
36 |
37 | def create_pinecone_secrets():
38 | create_secret_block('pinecone-api-key', 'PINECONE_API_KEY')
39 | create_secret_block('pinecone-env', 'PINECONE_ENV')
40 |
41 |
42 | def create_zilliz_secrets():
43 | create_secret_block('zilliz-cloud-uri', 'ZILLIZ_CLOUD_URI')
44 | create_secret_block('zilliz-cloud-api-key', 'ZILLIZ_CLOUD_API_KEY')
45 | create_secret_block('zilliz-public-endpoint', 'ZILLIZ_PUBLIC_ENDPOINT')
46 | create_secret_block('zilliz-api-key', 'ZILLIZ_API_KEY')
47 |
48 |
49 | def create_slack_secrets():
50 | create_secret_block('slack-bot-token', 'SLACK_BOT_TOKEN')
51 |
52 |
53 | def create_github_secrets():
54 | create_secret_block('github-token', 'GITHUB_TOKEN')
55 |
56 |
57 | def create_upstash_redis_secrets():
58 | create_secret_block('upstash-redis-rest-url', 'UPSTASH_REDIS_REST_URL')
59 | create_secret_block('upstash-redis-rest-token', 'UPSTASH_REDIS_REST_TOKEN')
60 |
61 |
62 | if __name__ == '__main__':
63 | create_gcp_creds_block()
64 | create_pinecone_secrets()
65 | create_zilliz_secrets()
66 | create_slack_secrets()
67 | create_github_secrets()
68 | create_upstash_redis_secrets()
69 |
--------------------------------------------------------------------------------
/ingest/readers/custom_faq_gdoc_reader.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Any, Optional
3 |
4 | from llama_index.core.readers.base import BasePydanticReader
5 | from llama_index.core.schema import Document
6 |
7 | DEFAULT_TOKEN_JSON_PATH = 'token.json'
8 | DEFAULT_SERVICE_ACCOUNT_JSON_PATH = 'service_account.json'
9 | DEFAULT_CREDENTIALS_JSON_PATH = 'credentials.json'
10 |
11 | HEADING_STYLE_TEMPLATE = 'HEADING_{}'
12 | DEFAULT_QUESTION_HEADING_STYLE_NUM = 2
13 |
14 | EXCLUDED_LLM_METADATA_KEYS = ['source', 'title', 'section_name']
15 | EXCLUDED_EMBED_METADATA_KEYS = ['source', 'title']
16 |
17 | SCOPES = ["https://www.googleapis.com/auth/documents.readonly"]
18 |
19 |
20 | class FAQGoogleDocsReader(BasePydanticReader):
21 | token_json_path: str = DEFAULT_TOKEN_JSON_PATH
22 | service_account_json_path: str = DEFAULT_SERVICE_ACCOUNT_JSON_PATH
23 | credentials_json_path: str = DEFAULT_CREDENTIALS_JSON_PATH
24 | question_heading_style_num: int = DEFAULT_QUESTION_HEADING_STYLE_NUM
25 | is_remote: bool = True
26 |
27 | def __init__(self,
28 | token_json_path: Optional[str] = DEFAULT_TOKEN_JSON_PATH,
29 | service_account_json_path: Optional[str] = DEFAULT_SERVICE_ACCOUNT_JSON_PATH,
30 | credentials_json_path: Optional[str] = DEFAULT_CREDENTIALS_JSON_PATH,
31 | question_heading_style_num: Optional[int] = DEFAULT_QUESTION_HEADING_STYLE_NUM
32 | ) -> None:
33 | """Initialize with parameters."""
34 | try:
35 | import google # noqa
36 | import google_auth_oauthlib # noqa
37 | import googleapiclient # noqa
38 | except ImportError as e:
39 | raise ImportError(
40 | '`google_auth_oauthlib`, `googleapiclient` and `google` '
41 | 'must be installed to use the GoogleDocsReader.\n'
42 | 'Please run `pip install --upgrade google-api-python-client '
43 | 'google-auth-httplib2 google-auth-oauthlib`.'
44 | ) from e
45 | super().__init__(token_json_path=token_json_path,
46 | service_account_json_path=service_account_json_path,
47 | credentials_json_path=credentials_json_path,
48 | question_heading_style_num=question_heading_style_num)
49 |
50 | @classmethod
51 | def class_name(cls) -> str:
52 | return 'CustomGoogleDocsReader'
53 |
54 | def load_data(self, document_ids: list[str]) -> list[Document]:
55 | """Load data from Google Docs.
56 |
57 | Args:
58 | document_ids (List[str]): a list of document ids.
59 | """
60 | if document_ids is None:
61 | raise ValueError('Must specify a "document_ids" in `load_kwargs`.')
62 |
63 | results = []
64 | for document_id in document_ids:
65 | docs = self._load_docs(document_id)
66 | results.extend(docs)
67 | return results
68 |
69 | def _load_docs(self, document_id: str) -> list[Document]:
70 | """Load a document from Google Docs.
71 |
72 | Args:
73 | document_id: the document id.
74 |
75 | Returns:
76 | The document text.
77 | """
78 | import googleapiclient.discovery as discovery
79 |
80 | credentials = self._get_credentials()
81 | docs_service = discovery.build('docs', 'v1', credentials=credentials)
82 | doc = docs_service.documents().get(documentId=document_id).execute()
83 | doc_content = doc.get('body').get('content')
84 | doc_source = f'https://docs.google.com/document/d/{document_id}/edit#heading='
85 | return self._structural_elements_to_docs(doc_content, doc_source)
86 |
87 | def _get_credentials(self) -> Any:
88 | """Get valid user credentials from storage.
89 |
90 | The file token.json stores the user's access and refresh tokens, and is
91 | created automatically when the authorization flow completes for the first
92 | time.
93 |
94 | Returns:
95 | Credentials, the obtained credential.
96 | """
97 | from google.auth.transport.requests import Request
98 | from google.oauth2 import service_account
99 | from google.oauth2.credentials import Credentials
100 | from google_auth_oauthlib.flow import InstalledAppFlow
101 |
102 | creds = None
103 | if os.path.exists(self.token_json_path):
104 | creds = Credentials.from_authorized_user_file(self.token_json_path, SCOPES)
105 | elif os.path.exists(self.service_account_json_path):
106 | return service_account.Credentials.from_service_account_file(
107 | self.service_account_json_path, scopes=SCOPES
108 | )
109 | # If there are no (valid) credentials available, let the user log in.
110 | if not creds or not creds.valid:
111 | if creds and creds.expired and creds.refresh_token:
112 | creds.refresh(Request())
113 | else:
114 | flow = InstalledAppFlow.from_client_secrets_file(
115 | self.credentials_json_path, SCOPES
116 | )
117 | creds = flow.run_local_server(port=8080)
118 | # Save the credentials for the next run
119 | with open(self.token_json_path, 'w') as token:
120 | token.write(creds.to_json())
121 |
122 | return creds
123 |
124 | @staticmethod
125 | def _read_paragraph_element(element: Any) -> Any:
126 | """Return the text in the given ParagraphElement.
127 |
128 | Args:
129 | element: a ParagraphElement from a Google Doc.
130 | """
131 | text_run = element.get('textRun')
132 | return text_run.get('content') if text_run else ''
133 |
134 | @staticmethod
135 | def _get_text_from_paragraph_elements(elements: list[Any]) -> Any:
136 | return ''.join(FAQGoogleDocsReader._read_paragraph_element(elem) for elem in elements)
137 |
138 | def _structural_elements_to_docs(self,
139 | doc_elements: list[Any],
140 | doc_source: str) -> list[Document]:
141 | """Recurse through a list of Structural Elements.
142 |
143 | Read a document's text where text may be in nested elements.
144 |
145 | Args:
146 | doc_elements: a list of Structural Elements.
147 | """
148 | docs = []
149 | text = ''
150 | heading_id = ''
151 | section_name = ''
152 | question_heading_style = HEADING_STYLE_TEMPLATE.format(self.question_heading_style_num)
153 | section_heading_style = HEADING_STYLE_TEMPLATE.format(self.question_heading_style_num - 1)
154 | for value in doc_elements:
155 | if 'paragraph' in value:
156 | paragraph = value['paragraph']
157 | elements = paragraph.get('elements')
158 | paragraph_text = FAQGoogleDocsReader._get_text_from_paragraph_elements(elements)
159 | if 'paragraphStyle' in paragraph and 'headingId' in paragraph['paragraphStyle']:
160 | named_style_type = paragraph['paragraphStyle']['namedStyleType']
161 | if named_style_type in [
162 | question_heading_style,
163 | section_heading_style,
164 | ]:
165 | # create previous document checking if it's not empty
166 | if text != '':
167 | node_metadata = {
168 | 'source': doc_source + heading_id,
169 | 'section_name': section_name,
170 | 'title': 'FAQ'
171 | }
172 | prev_doc = Document(text=text,
173 | metadata=node_metadata,
174 | excluded_embed_metadata_keys=EXCLUDED_EMBED_METADATA_KEYS,
175 | excluded_llm_metadata_keys=EXCLUDED_LLM_METADATA_KEYS)
176 | docs.append(prev_doc)
177 | if named_style_type == question_heading_style:
178 | heading_id = paragraph['paragraphStyle']['headingId']
179 | text = paragraph_text
180 | else:
181 | section_name = paragraph_text
182 | text = ''
183 | else:
184 | text += paragraph_text
185 | return docs
186 |
187 |
188 | if __name__ == '__main__':
189 | reader = FAQGoogleDocsReader(service_account_json_path='../keys/service_account_key.json')
190 | docs = reader.load_data(['1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8'])
191 | print(docs)
192 |
--------------------------------------------------------------------------------
/ingest/readers/slack_reader.py:
--------------------------------------------------------------------------------
1 | """Slack reader."""
2 | import logging
3 | import os
4 | import sys
5 | import time
6 | from datetime import datetime, timedelta
7 | from http.client import IncompleteRead
8 | from ssl import SSLContext
9 | from typing import Any, Optional
10 |
11 | from llama_index.core.bridge.pydantic import PrivateAttr
12 | from llama_index.core.readers.base import BasePydanticReader
13 | from llama_index.core.schema import Document
14 |
15 | logging.basicConfig(stream=sys.stdout, level=logging.INFO,
16 | format='%(message)s')
17 | logger = logging.getLogger(__name__)
18 | EXCLUDED_METADATA_FIELDS = ['channel', 'thread_ts']
19 |
20 |
21 | class SlackReader(BasePydanticReader):
22 | """Slack reader.
23 |
24 | Reads conversations from channels. If the earliest_date is provided, an
25 | optional latest_date can also be provided. If no latest_date is provided,
26 | we assume the latest date is the current timestamp.
27 |
28 | Args:
29 | slack_token (Optional[str]): Slack token. If not provided, we
30 | assume the environment variable `SLACK_BOT_TOKEN` is set.
31 | ssl (Optional[str]): Custom SSL context. If not provided, it is assumed
32 | there is already an SSL context available.
33 | earliest_date (Optional[datetime]): Earliest date from which
34 | to read conversations. If not provided, we read all messages.
35 | latest_date (Optional[datetime]): Latest date from which to
36 | read conversations. If not provided, defaults to current timestamp
37 | in combination with earliest_date.
38 | """
39 |
40 | is_remote: bool = True
41 | slack_token: str
42 | earliest_date_timestamp: Optional[float]
43 | latest_date_timestamp: float
44 | bot_user_id: Optional[str]
45 | not_ignore_users: Optional[list[str]] = []
46 |
47 | _client: Any = PrivateAttr()
48 |
49 | def __init__(
50 | self,
51 | slack_token: Optional[str] = None,
52 | ssl: Optional[SSLContext] = None,
53 | earliest_date: Optional[datetime] = None,
54 | latest_date: Optional[datetime] = None,
55 | earliest_date_timestamp: Optional[float] = None,
56 | latest_date_timestamp: Optional[float] = None,
57 | bot_user_id: Optional[str] = None,
58 | not_ignore_users: Optional[list[str]] = None
59 | ) -> None:
60 | """Initialize with parameters."""
61 | from slack_sdk import WebClient
62 |
63 | if slack_token is None:
64 | slack_token = os.environ["SLACK_BOT_TOKEN"]
65 | if slack_token is None:
66 | raise ValueError(
67 | "Must specify `slack_token` or set environment "
68 | "variable `SLACK_BOT_TOKEN`."
69 | )
70 | if ssl is None:
71 | self._client = WebClient(token=slack_token)
72 | else:
73 | self._client = WebClient(token=slack_token, ssl=ssl)
74 | if latest_date is not None and earliest_date is None:
75 | raise ValueError(
76 | "Must specify `earliest_date` if `latest_date` is specified."
77 | )
78 | if not_ignore_users is None:
79 | not_ignore_users = []
80 | if earliest_date is not None:
81 | earliest_date_timestamp = earliest_date.timestamp()
82 | else:
83 | earliest_date_timestamp = None or earliest_date_timestamp
84 | if latest_date is not None:
85 | latest_date_timestamp = latest_date.timestamp()
86 | else:
87 | latest_date_timestamp = latest_date_timestamp or datetime.now().timestamp()
88 | res = self._client.api_test()
89 | if not res["ok"]:
90 | raise ValueError(f"Error initializing Slack API: {res['error']}")
91 |
92 | super().__init__(
93 | slack_token=slack_token,
94 | earliest_date_timestamp=earliest_date_timestamp,
95 | latest_date_timestamp=latest_date_timestamp,
96 | bot_user_id=bot_user_id,
97 | not_ignore_users=not_ignore_users,
98 | )
99 |
100 | @classmethod
101 | def class_name(cls) -> str:
102 | """Get the name identifier of the class."""
103 | return "SlackReader"
104 |
105 | def _read_message(self, channel_id: str, message_ts: str) -> Document:
106 | """Read a message."""
107 |
108 | from slack_sdk.errors import SlackApiError
109 |
110 | messages_text: list[str] = []
111 | next_cursor = None
112 | while True:
113 | try:
114 | # https://slack.com/api/conversations.replies
115 | # List all replies to a message, including the message itself.
116 | conversations_replies_kwargs = {
117 | "channel": channel_id,
118 | "ts": message_ts,
119 | "cursor": next_cursor,
120 | }
121 | if self.earliest_date_timestamp is not None:
122 | conversations_replies_kwargs |= {
123 | "latest": str(self.latest_date_timestamp),
124 | "oldest": str(self.earliest_date_timestamp),
125 | }
126 | result = self._client.conversations_replies(
127 | **conversations_replies_kwargs # type: ignore
128 | )
129 | messages = result["messages"]
130 | messages_text.extend(message["text"] for message in messages if message['user'] != self.bot_user_id
131 | and message['user'] not in self.not_ignore_users)
132 | messages_text.extend(message["attachments"][0]["text"] for message in messages if
133 | message['user'] in self.not_ignore_users
134 | and "attachments" in message
135 | and "text" in message["attachments"][0])
136 |
137 | if not result["has_more"]:
138 | break
139 |
140 | next_cursor = result["response_metadata"]["next_cursor"]
141 | except SlackApiError as e:
142 | self.sleep_on_ratelimit(e)
143 |
144 | return Document(text="\n\n".join(messages_text),
145 | metadata={"channel": channel_id, "thread_ts": float(message_ts)},
146 | excluded_embed_metadata_keys=EXCLUDED_METADATA_FIELDS,
147 | excluded_llm_metadata_keys=EXCLUDED_METADATA_FIELDS
148 | )
149 |
150 | def _read_channel(self, channel_id: str) -> list[Document]:
151 | """Read a channel."""
152 |
153 | from slack_sdk.errors import SlackApiError
154 |
155 | thread_documents: list[Document] = []
156 | next_cursor = None
157 | while True:
158 | try:
159 | # Call the conversations.history method using the WebClient
160 | # conversations.history returns the first 100 messages by default
161 | # These results are paginated,
162 | # see: https://api.slack.com/methods/conversations.history$pagination
163 | conversations_history_kwargs = {
164 | "channel": channel_id,
165 | "cursor": next_cursor,
166 | "latest": str(self.latest_date_timestamp),
167 | }
168 | if self.earliest_date_timestamp is not None:
169 | conversations_history_kwargs["oldest"] = str(
170 | self.earliest_date_timestamp
171 | )
172 | result = self._client.conversations_history(
173 | **conversations_history_kwargs # type: ignore
174 | )
175 | conversation_history = result["messages"]
176 | # Print results
177 | logger.info(f"{len(conversation_history)} messages found in {channel_id}")
178 |
179 | for message in conversation_history:
180 | if self.is_for_indexing(message):
181 | read_message: Document = self._read_message(channel_id, message["ts"])
182 | if read_message.text != "":
183 | thread_documents.append(read_message)
184 |
185 | if not result["has_more"]:
186 | break
187 | next_cursor = result["response_metadata"]["next_cursor"]
188 |
189 | except SlackApiError as e:
190 | self.sleep_on_ratelimit(e)
191 | except IncompleteRead:
192 | continue
193 |
194 | return thread_documents
195 |
196 | @staticmethod
197 | def sleep_on_ratelimit(e):
198 | if e.response["error"] == "ratelimited":
199 | retry_after = e.response.headers["retry-after"]
200 | logger.error(
201 | f'Rate limit error reached, sleeping for: {retry_after} seconds'
202 | )
203 | time.sleep(int(retry_after) + 1)
204 | else:
205 | logger.error(f"Error parsing conversation replies: {e}")
206 |
207 | def is_for_indexing(self, message):
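    | # Heuristic for what to index: a thread with replies from someone other than the bot
    | # (or any replies if no bot user id is set), or an otherwise unanswered message posted
    | # by a user from the not_ignore_users list (e.g. the course automation bot).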
208 | # ignore unanswered messages
209 | if 'reply_count' in message:
210 | # if the bot user id isn't specified or the bot hasn't replied to the message
211 | if not self.bot_user_id or self.bot_user_id not in message['reply_users']:
212 | return True
213 | if message['reply_users_count'] > 1:
214 | return True
215 | # even if it's a single message but from a user in un-ignore list, index it
216 | elif message['user'] in self.not_ignore_users:
217 | return True
218 | return False
219 |
220 | def load_data(self, channel_ids: list[str]) -> list[Document]:
221 | """Load data from the input directory.
222 |
223 | Args:
224 | channel_ids (List[str]): List of channel ids to read.
225 | Returns:
226 | List[Document]: List of documents.
227 | """
228 | results = []
229 | for channel_id in channel_ids:
230 | results.extend(self._read_channel(channel_id))
231 | return results
232 |
233 |
234 | if __name__ == "__main__":
235 | reader = SlackReader(earliest_date=datetime.now() - timedelta(days=2),
236 | bot_user_id='U05DM3PEJA2',
237 | not_ignore_users=['U01S08W6Z9T'])
238 | for thread in reader.load_data(channel_ids=["C02R98X7DS9"]):
239 | logger.info(f'Text: {thread.text}')
240 | logger.info(f'Metadata: {thread.metadata}')
241 | logger.info('----------------------------')
242 |
--------------------------------------------------------------------------------
/ingest/readers/youtube_reader.py:
--------------------------------------------------------------------------------
1 | """YouTube reader."""
2 |
3 | from llama_index.core.readers.base import BasePydanticReader
4 | from llama_index.core.schema import Document
5 |
6 |
7 | class YoutubeReader(BasePydanticReader):
8 |
9 | def __init__(self) -> None:
10 | try:
11 | from youtube_transcript_api import YouTubeTranscriptApi
12 | except ImportError as e:
13 | raise ImportError(
14 | '`youtube_transcript_api` must be installed to use the YoutubeReader.\n'
15 | 'Please run `pip install --upgrade youtube-transcript-api`.'
16 | ) from e
17 |
18 | super().__init__()
19 |
20 | @classmethod
21 | def class_name(cls) -> str:
22 | """Get the name identifier of the class."""
23 | return "YoutubeReader"
24 |
25 | def load_data(self, video_ids: list[str], tokenizer) -> list[Document]:
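    | # Split each transcript into chunks of at most 512 tokens (the max sequence length of
    | # the bge-base embedding model used at indexing time), keeping a timestamped link per chunk.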
26 | from youtube_transcript_api import YouTubeTranscriptApi
27 |
28 | documents: list[Document] = []
29 | for video_id in video_ids:
30 | yt_title = YoutubeReader._read_title(video_id)
31 | current_start = None
32 | current_text = ""
33 | current_token_count = 0
34 | transcript_array = YouTubeTranscriptApi.get_transcript(video_id)
35 |
36 | for segment in transcript_array:
37 | # Get the token count of the current segment text
38 | token_count = len(tokenizer(segment["text"], truncation=False, add_special_tokens=False)['input_ids'])
39 |
40 | # If adding this segment exceeds 512 tokens, finalize the current document
41 | if current_token_count + token_count > 512:
42 | documents.append(Document(
43 | text=current_text.strip(),
44 | metadata=YoutubeReader._get_node_metadata(video_id, int(current_start), yt_title),
45 | excluded_embed_metadata_keys=['yt_link'],
46 | excluded_llm_metadata_keys=['yt_link']
47 | ))
48 |
49 | # Start a new chunk
50 | current_start = segment["start"]
51 | current_text = segment["text"]
52 | current_token_count = token_count
53 | else:
54 | # Concatenate to the current chunk
55 | if not current_text:
56 | current_start = segment["start"]
57 | current_text += " " + segment["text"]
58 | current_token_count += token_count
59 |
60 | # Append the last chunk if it exists
61 | if current_text:
62 | documents.append(Document(
63 | text=current_text.strip(),
64 | metadata=YoutubeReader._get_node_metadata(video_id, int(current_start), yt_title),
65 | excluded_embed_metadata_keys=['yt_link'],
66 | excluded_llm_metadata_keys=['yt_link']
67 | ))
68 |
69 | return documents
70 |
71 | @staticmethod
72 | def _get_node_metadata(video_id: str, pos: int, yt_title: str) -> dict:
73 | return {
74 | 'yt_link': f"https://www.youtube.com/watch?v={video_id}&t={pos}s",
75 | 'yt_title': yt_title
76 | }
77 |
78 | @staticmethod
79 | def _read_title(video_id: str) -> str:
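    | # Resolve the video title via YouTube's public oEmbed endpoint (no API key required)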
80 | params = {
81 | "format": "json",
82 | "url": f"https://www.youtube.com/watch?v={video_id}"
83 | }
84 | url = "https://www.youtube.com/oembed"
85 |
86 | import requests
87 | response = requests.get(url, params=params)
88 | if response.status_code == 200:
89 | data = response.json()
90 | return data['title']
91 | else:
92 | print(f"Failed to retrieve data: {response.status_code}")
93 | return ''
94 |
--------------------------------------------------------------------------------
/ingest/requirements.txt:
--------------------------------------------------------------------------------
1 | slack-sdk==3.30.0
2 | langchain==0.1.20
3 | google-api-python-client==2.134.0
4 | google-auth-httplib2==0.2.0
5 | google-auth-oauthlib==1.2.0
6 | sentence-transformers==3.0.1
7 | prefect-gcp==0.5.12
8 | GitPython==3.1.43
9 | pymilvus==2.4.4
10 | llama-index-core==0.10.48
11 | llama-index-readers-web==0.1.19
12 | llama-index-readers-github==0.1.9
13 | llama-index-vector-stores-milvus==0.1.20
14 | llama-index-embeddings-langchain==0.1.2
15 | trafilatura==1.10.0
16 | nbconvert==7.16.4
17 | ipython==8.25.0
18 | upstash-redis==1.1.0
19 | jupyter-notebook-parser==0.1.4
20 | youtube-transcript-api==0.6.3
21 |
--------------------------------------------------------------------------------
/ingest/utils/index_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import tempfile
4 | from datetime import datetime, timedelta
5 |
6 | from jupyter_notebook_parser import JupyterNotebookParser
7 | from langchain.embeddings import CacheBackedEmbeddings
8 | from langchain_community.embeddings import HuggingFaceEmbeddings
9 | from langchain_community.storage import UpstashRedisByteStore
10 | from llama_index.core import Settings
11 | from llama_index.core.indices import VectorStoreIndex
12 | from llama_index.core.node_parser import NodeParser, SentenceSplitter, MarkdownNodeParser
13 | from llama_index.core.schema import Document
14 | from llama_index.core.storage import StorageContext
15 | from llama_index.readers.github import GithubRepositoryReader, GithubClient
16 | from llama_index.readers.web import TrafilaturaWebReader
17 | from llama_index.vector_stores.milvus import MilvusVectorStore
18 | from prefect.blocks.system import Secret
19 | from prefect_gcp import GcpCredentials
20 | from upstash_redis import Redis
21 |
22 | from ingest.readers.custom_faq_gdoc_reader import FAQGoogleDocsReader
23 | from ingest.readers.slack_reader import SlackReader
24 | from ingest.readers.youtube_reader import YoutubeReader
25 |
26 | BOT_USER_ID = 'U05DM3PEJA2'
27 | AU_TOMATOR_USER_ID = 'U01S08W6Z9T'
28 |
29 | EXCLUDE_FILTER_TYPE = GithubRepositoryReader.FilterType.EXCLUDE
30 |
31 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
32 |
33 | embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5')
34 |
35 | embedding_dimension = len(embeddings.embed_query("test"))
36 | print(f'embedding dimension = {embedding_dimension}')
37 |
38 |
39 | def load_embeddings() -> CacheBackedEmbeddings:
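    | # Wrap the HuggingFace embedder with an Upstash Redis-backed cache so that
    | # re-ingesting unchanged text does not recompute embeddings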
40 | redis_client = Redis(url=Secret.load('upstash-redis-rest-url').get(),
41 | token=Secret.load('upstash-redis-rest-token').get())
42 | embeddings_cache = UpstashRedisByteStore(client=redis_client,
43 | ttl=None,
44 | namespace=os.getenv('EMBEDDING_CACHE_NAMESPACE'))
45 |
46 | cached_embedder = CacheBackedEmbeddings.from_bytes_store(
47 | embeddings,
48 | embeddings_cache,
49 | namespace=embeddings.model_name + "/",
50 | )
51 | return cached_embedder
52 |
53 |
54 | Settings.embed_model = load_embeddings()
55 | Settings.llm = None
56 |
57 |
58 | def index_spreadsheet(url: str, title: str, collection_name: str):
59 | documents = TrafilaturaWebReader().load_data([url])
60 | for doc in documents:
61 | doc.metadata['title'] = title
62 | doc.metadata['source'] = url
63 | add_route_to_docs(documents, 'faq')
64 | add_to_index(documents, collection_name=collection_name)
65 |
66 |
67 | def add_route_to_docs(docs: list[Document], route_name: str):
68 | route_key_name = 'route'
69 | for doc in docs:
70 | doc.metadata[route_key_name] = route_name
71 | doc.excluded_embed_metadata_keys.append(route_key_name)
72 | doc.excluded_llm_metadata_keys.append(route_key_name)
73 |
74 |
75 | def add_to_index(documents: list[Document],
76 | collection_name: str,
77 | overwrite: bool = False,
78 | node_parser: NodeParser = None):
79 | sentence_splitter = SentenceSplitter.from_defaults(chunk_size=512, chunk_overlap=50,
80 | tokenizer=embeddings.client.tokenizer)
81 | environment = os.getenv('EXECUTION_ENV', 'local')
82 | if environment == 'local':
83 | milvus_vector_store = MilvusVectorStore(uri='http://localhost:19530',
84 | collection_name=collection_name,
85 | dim=embedding_dimension,
86 | overwrite=overwrite)
87 | elif environment == 'zilliz-cluster':
88 | milvus_vector_store = MilvusVectorStore(
89 | uri=Secret.load('zilliz-public-endpoint').get(),
90 | token=Secret.load('zilliz-api-key').get(),
91 | collection_name=collection_name,
92 | dim=embedding_dimension,
93 | overwrite=overwrite)
94 | else:
95 | milvus_vector_store = MilvusVectorStore(collection_name=collection_name,
96 | uri=Secret.load('zilliz-cloud-uri').get(),
97 | token=Secret.load('zilliz-cloud-api-key').get(),
98 | dim=embedding_dimension,
99 | overwrite=overwrite)
100 | storage_context = StorageContext.from_defaults(vector_store=milvus_vector_store)
101 | transformations = [t for t in [node_parser, sentence_splitter] if t is not None]
102 |
103 | VectorStoreIndex.from_documents(documents,
104 | transformations=transformations,
105 | storage_context=storage_context,
106 | show_progress=True)
107 |
108 |
109 | def index_github_repo(owner: str,
110 | repo: str,
111 | branch: str,
112 | collection_name: str,
113 | ignore_file_extensions: list[str] = None,
114 | ignore_directories: list[str] = None,
115 | ):
116 | if ignore_file_extensions is None:
117 | ignore_file_extensions = ['.jpg', '.png', '.svg', '.gitignore', '.csv', '.jar']
118 | if ignore_directories is None:
119 | ignore_directories = ['.github', '.gitignore', '2021', '2022', 'images']
120 | github_client = GithubClient(Secret.load('github-token').get(), verbose=True)
121 | documents = GithubRepositoryReader(
122 | github_client=github_client,
123 | owner=owner,
124 | repo=repo,
125 | filter_directories=(ignore_directories, EXCLUDE_FILTER_TYPE),
126 | filter_file_extensions=(ignore_file_extensions, EXCLUDE_FILTER_TYPE),
127 | ).load_data(branch=branch)
128 | for doc in documents:
129 | doc.metadata['branch'] = branch
130 | doc.metadata['owner'] = owner
131 | doc.metadata['repo'] = repo
132 | add_route_to_docs(documents, 'github')
133 |
134 | ipynb_docs = [parse_ipynb_doc(doc) for doc in documents if doc.metadata.get('file_name', '').endswith('.ipynb')]
135 | md_docs = [doc for doc in documents if doc.metadata.get('file_name', '').endswith('.md')]
136 | other_docs = [doc for doc in documents if not doc.metadata.get('file_name', '').endswith(('.ipynb', '.md'))]
137 |
138 | add_to_index(other_docs, collection_name=collection_name)
139 | add_to_index(md_docs, collection_name=collection_name, node_parser=MarkdownNodeParser())
140 | add_to_index(ipynb_docs, collection_name=collection_name)
141 |
142 |
143 | def parse_ipynb_doc(ipynb_doc: Document) -> Document:
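    | # Flatten the notebook JSON into plain text by concatenating the source of all cells,
    | # so it can be chunked and embedded like any other document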
144 | ipynb_json = json.loads(ipynb_doc.text)
145 | temp_ipynb = tempfile.NamedTemporaryFile(suffix='.ipynb')
146 | try:
147 | with open(temp_ipynb.name, 'w') as f_out:
148 | json.dump(ipynb_json, f_out)
149 | parsed = JupyterNotebookParser(temp_ipynb.name)
150 | all_cells = parsed.get_all_cells()
151 | parsed_text = ''.join([JupyterNotebookParser._join_source_lines(cell.get('source', ''))
152 | for cell in all_cells])
153 | ipynb_doc.text = parsed_text
154 | return ipynb_doc
155 | finally:
156 | temp_ipynb.close()
157 |
158 |
159 | def index_slack_history(channel_ids: list[str], collection_name: str):
160 | earliest_date = datetime.now() - timedelta(days=90)
161 | slack_reader = SlackReader(earliest_date=earliest_date,
162 | bot_user_id=BOT_USER_ID,
163 | not_ignore_users=[AU_TOMATOR_USER_ID],
164 | slack_token=Secret.load('slack-bot-token').get())
165 | print('Starting to load slack messages from the last 90 days')
166 | documents = slack_reader.load_data(channel_ids=channel_ids)
167 | add_route_to_docs(documents, 'slack')
168 | print('Starting to add loaded Slack messages to the index')
169 | add_to_index(documents, collection_name=collection_name)
170 |
171 |
172 | def index_faq(document_ids: list[str], collection_name: str):
173 | temp_creds = tempfile.NamedTemporaryFile()
174 | creds_dict = GcpCredentials.load("google-drive-creds").service_account_info.get_secret_value()
175 | with open(temp_creds.name, 'w') as f_out:
176 | json.dump(creds_dict, f_out)
177 | gdocs_reader = FAQGoogleDocsReader(service_account_json_path=temp_creds.name)
178 | print('Starting to load FAQ document')
179 | documents = gdocs_reader.load_data(document_ids=document_ids)
180 | temp_creds.close()
181 | add_route_to_docs(documents, 'faq')
182 | print('Starting to add loaded FAQ document to the index')
183 | add_to_index(documents,
184 | collection_name=collection_name,
185 | overwrite=True,
186 | )
187 |
188 |
189 | def index_youtube(video_ids: list[str], collection_name: str):
190 | yt_reader = YoutubeReader()
191 | documents = yt_reader.load_data(video_ids=video_ids, tokenizer=embeddings.client.tokenizer)
192 | print('Starting to add loaded Video transcripts to the index')
193 | add_to_index(documents, collection_name=collection_name)
194 |
--------------------------------------------------------------------------------
/prefect.yaml:
--------------------------------------------------------------------------------
1 | # Welcome to your prefect.yaml file! You can use this file for storing and managing
2 | # configuration for deploying your flows. We recommend committing this file to source
3 | # control along with your flow code.
4 |
5 | # Generic metadata about this project
6 | name: zoomcamp-bot-index
7 | prefect-version: 2.19.5
8 |
9 | # build section allows you to manage and build docker images
10 | build:
11 |
12 | # push section allows you to manage if and how this project is uploaded to remote locations
13 | push:
14 | - prefect_docker.deployments.steps.push_docker_image:
15 | requires: prefect-docker>=0.3.1
16 | image_name: '{{ build_image.image_name }}'
17 | tag: '{{ build_image.tag }}'
18 |
19 | # pull section allows you to provide instructions for cloning this project in remote locations
20 | pull:
21 |
22 |
23 | definitions:
24 | work_pools:
25 | zoomcamp_faq_bot_workpool: &zoomcamp-faq-bot-pool
26 | name: zoomcamp-faq-bot
27 | work_queue_name: docker_queue
28 | job_variables: &job-variables
29 | image: '{{ build_image.image }}'
30 | env:
31 | EXECUTION_ENV: zilliz
32 | auto_remove: true
33 | schedules:
34 | at_0_daily: &at_0_daily
35 | cron: 0 0 * * *
36 | timezone: Europe/Madrid
37 | day_or: true
38 | at_1_daily: &at_1_daily
39 | cron: 0 1 * * *
40 | timezone: Europe/Madrid
41 | day_or: true
42 | at_23_monday: &at_23_monday
43 | cron: 0 23 * * 1
44 | timezone: Europe/Madrid
45 | day_or: true
46 | at_23_tuesday: &at_23_tuesday
47 | cron: 0 23 * * 2
48 | timezone: Europe/Madrid
49 | day_or: true
50 | at_23_wednesday: &at_23_wednesday
51 | cron: 0 23 * * 3
52 | timezone: Europe/Madrid
53 | day_or: true
54 | actions:
55 | docker_build:
56 | - prefect.deployments.steps.run_shell_script: &shell-script-config
57 | id: get-commit-hash
58 | script: git rev-parse --short HEAD
59 | stream_output: false
60 | - prefect_docker.deployments.steps.build_docker_image: &docker-build-config
61 | id: build_image
62 | requires: prefect-docker>=0.3.1
63 | tag: '{{ get-commit-hash.stdout }}'
64 | platform: linux/amd64
65 |
66 |
67 | # the deployments section allows you to provide configuration for deploying flows
68 | deployments:
69 | - name: fill-index-zilliz-ml
70 | tags:
71 | - ml-ingest
72 | - zoomcamp-faq-bot
73 | description: Fill Zilliz index for ML Zoomcamp
74 | schedules:
75 | - *at_23_tuesday
76 | entrypoint: ingest/ml/ingest_ml.py:fill_ml_index
77 | work_pool: *zoomcamp-faq-bot-pool
78 | build:
79 | - prefect.deployments.steps.run_shell_script: *shell-script-config
80 | - prefect_docker.deployments.steps.build_docker_image:
81 | <<: *docker-build-config # Uses the docker_build_config and overrides the dockerfile and image_name fields
82 | dockerfile: ingest/ml.dockerfile
83 | image_name: aaalexlit/zoomcamp-faq-ingest-ml
84 | pull:
85 | - prefect.deployments.steps.set_working_directory:
86 | directory: /usr/src
87 | - name: fill-index-zilliz-de
88 | tags:
89 | - de-ingest
90 | - zoomcamp-faq-bot
91 | description: Fill Zilliz index for DE Zoomcamp
92 | schedules:
93 | - *at_23_monday
94 | entrypoint: ingest/de/ingest_de.py:fill_de_index
95 | work_pool: *zoomcamp-faq-bot-pool
96 | build:
97 | - prefect.deployments.steps.run_shell_script: *shell-script-config
98 | - prefect_docker.deployments.steps.build_docker_image:
99 | <<: *docker-build-config
100 | # Uses the docker_build_config and overrides the dockerfile and image_name fields
101 | dockerfile: ingest/de.dockerfile
102 | image_name: aaalexlit/zoomcamp-faq-ingest-de
103 | pull:
104 | - prefect.deployments.steps.set_working_directory:
105 | directory: /usr/src
106 | - name: fill-index-zilliz-mlops
107 | tags:
108 | - mlops-ingest
109 | - zoomcamp-faq-bot
110 | description: Fill Zilliz index for MLOps Zoomcamp
111 | schedules:
112 | - *at_0_daily
113 | entrypoint: ingest/mlops/ingest_mlops.py:fill_mlops_index
114 | work_pool:
115 | <<: *zoomcamp-faq-bot-pool
116 | job_variables:
117 | <<: *job-variables
118 | env:
119 | EXECUTION_ENV: zilliz-cluster
120 | build:
121 | - prefect.deployments.steps.run_shell_script: *shell-script-config
122 | - prefect_docker.deployments.steps.build_docker_image:
123 | <<: *docker-build-config
124 | # Uses the docker_build_config and overrides the dockerfile and image_name fields
125 | dockerfile: ingest/mlops.dockerfile
126 | image_name: aaalexlit/zoomcamp-faq-ingest-mlops
127 | pull:
128 | - prefect.deployments.steps.set_working_directory:
129 | directory: /usr/src
130 | - name: fill-index-zilliz-llm
131 | tags:
132 | - llm-ingest
133 | - zoomcamp-faq-bot
134 | description: Fill Zilliz index for LLM Zoomcamp
135 | schedules:
136 | - *at_23_wednesday
137 | entrypoint: ingest/llm/ingest_llm.py:fill_llm_index
138 | work_pool:
139 | <<: *zoomcamp-faq-bot-pool
140 | job_variables:
141 | <<: *job-variables
142 | env:
143 | EXECUTION_ENV: zilliz-cluster
144 | build:
145 | - prefect.deployments.steps.run_shell_script: *shell-script-config
146 | - prefect_docker.deployments.steps.build_docker_image:
147 | <<: *docker-build-config
148 | # Uses the docker_build_config and overrides the dockerfile and image_name fields
149 | dockerfile: ingest/llm.dockerfile
150 | image_name: aaalexlit/zoomcamp-faq-ingest-llm
151 | pull:
152 | - prefect.deployments.steps.set_working_directory:
153 | directory: /usr/src
154 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai
2 | slack-bolt
3 | slack-sdk
4 | langchain
5 | google-api-python-client
6 | google-auth-httplib2
7 | google-auth-oauthlib
8 | sentence-transformers
9 | prefect
10 | prefect-gcp
11 | GitPython
12 | pymilvus
13 | llama-index-core
14 | llama-index-readers-web
15 | llama-index-readers-github
16 | llama-index-vector-stores-milvus
17 | llama-index-embeddings-langchain
18 | llama-index-postprocessor-cohere-rerank
19 | llama-index-llms-langchain
20 | llama-index-llms-fireworks
21 | ipython
22 | cohere
23 | trafilatura
24 | nbconvert
25 | prefect-docker
26 | langchain-openai
27 | upstash-redis
28 | jupyter-notebook-parser
29 | requests==2.31.0
30 | youtube-transcript-api
31 |
--------------------------------------------------------------------------------
/slack_bot/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim
2 |
3 | RUN apt-get update && \
4 | apt-get install -y gcc python3-dev
5 |
6 | WORKDIR /usr/src/app
7 |
8 | COPY requirements.txt ./
9 | RUN pip install --no-cache-dir -r requirements.txt
10 |
11 | COPY main.py ./
12 |
13 | CMD [ "python", "-u", "./main.py" ]
--------------------------------------------------------------------------------
/slack_bot/README.md:
--------------------------------------------------------------------------------
1 | # Running the bot locally
2 |
3 | 1. Re-create a separate conda environment using [slack_bot/requirements.txt](../slack_bot/requirements.txt)
4 | ```shell
5 | conda activate base
6 | conda remove --name slack-bot --all
7 | conda create --name slack-bot python=3.10
8 | conda activate slack-bot
9 | cd slack_bot
10 | pip install -r requirements.txt
11 | ```
12 | 1. Rename [dev.env](../dev.env) to `.env` and set all the required variables
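    | For example (one possible way, assuming the commands are run from the `slack_bot` directory; adjust the path if you keep your `.env` elsewhere):
    | ```shell
    | cp ../dev.env .env
    | # then open .env and fill in the real tokens and keys
    | ```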
13 |
14 | 1. Run ingestion with a local Milvus instance following [local_development.md](../ingest/local_development.md), for example:
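    | A sketch, assuming the local Milvus compose file in this repo is used ([local_development.md](../ingest/local_development.md) has the authoritative steps):
    | ```shell
    | cd ../ingest/local_milvus
    | docker compose up -d
    | ```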
15 |
16 | 1. Run [main.py](main.py)
17 |
18 | ```shell
19 | source .env
20 | python main.py
21 | ```
22 | In the PyCharm IDE, use the provided run configuration [run_bot_local_ws.run.xml](../.run/run_bot_local_ws.run.xml)
23 |
--------------------------------------------------------------------------------
/slack_bot/app_manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "display_information": {
3 | "name": "FAQBotForMLOps",
4 | "description": "MLOps FAQ as a bot",
5 | "background_color": "#2e7898"
6 | },
7 | "features": {
8 | "bot_user": {
9 | "display_name": "QABotForMLOps",
10 | "always_online": false
11 | }
12 | },
13 | "oauth_config": {
14 | "scopes": {
15 | "bot": [
16 | "app_mentions:read",
17 | "channels:history",
18 | "channels:read",
19 | "chat:write"
20 | ]
21 | }
22 | },
23 | "settings": {
24 | "event_subscriptions": {
25 | "bot_events": [
26 | "app_mention"
27 | ]
28 | },
29 | "interactivity": {
30 | "is_enabled": true
31 | },
32 | "org_deploy_enabled": false,
33 | "socket_mode_enabled": true,
34 | "token_rotation_enabled": false
35 | }
36 | }
--------------------------------------------------------------------------------
/slack_bot/bot_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaalexlit/faq-slack-bot/78962d17dbd68438bd443b7dfdb961ac9d13e574/slack_bot/bot_icon.png
--------------------------------------------------------------------------------
/slack_bot/dev.env:
--------------------------------------------------------------------------------
1 | # datatalks slack token
2 | SLACK_APP_TOKEN=xapp-..
3 | SLACK_BOT_TOKEN=xoxb-..
4 |
5 | # OpenAI API key
6 | OPENAI_API_KEY=sk-..
7 |
8 | PINECONE_API_KEY=..
9 | PINECONE_ENV=..
10 |
11 | WANDB_API_KEY=..
12 | LANGCHAIN_API_KEY=lsv2_..
13 |
14 | ZILLIZ_CLOUD_URI=https://..
15 | ZILLIZ_CLOUD_API_KEY=..
16 |
17 | ZILLIZ_PUBLIC_ENDPOINT=https://..
18 | ZILLIZ_API_KEY=..
19 |
20 | COHERE_API_KEY=..
21 |
22 | # DEBUG log level
23 | #LOG_LEVEL=10
24 |
--------------------------------------------------------------------------------
/slack_bot/docker-compose-my-workspace.yml:
--------------------------------------------------------------------------------
1 | services:
2 | faq-slack-bot:
3 | build:
4 | context: .
5 | platform: linux/amd64
6 | env_file:
7 | - ../.env
8 | environment:
9 | - LOCALHOST=host.docker.internal
10 |
--------------------------------------------------------------------------------
/slack_bot/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | faq-slack-bot:
3 | build:
4 | context: .
5 | platform: linux/amd64
6 | env_file:
7 | - .env
8 |
--------------------------------------------------------------------------------
/slack_bot/main.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import hashlib
3 | import logging
4 | import os
5 | import re
6 | import sys
7 | import uuid
8 |
9 | from cohere.core import ApiError as CohereAPIError
10 | from langchain import callbacks
11 | from langchain_openai import ChatOpenAI
12 | from langsmith import Client
13 | from llama_index.core import ChatPromptTemplate
14 | from llama_index.core import VectorStoreIndex, Settings
15 | from llama_index.core import get_response_synthesizer
16 | from llama_index.core.llms import ChatMessage, MessageRole
17 | from llama_index.core.postprocessor import TimeWeightedPostprocessor
18 | from llama_index.core.query_engine import RetrieverQueryEngine
19 | # from llama_index.postprocessor.cohere_rerank import CohereRerank
20 | from llama_index.vector_stores.milvus import MilvusVectorStore
21 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding
22 | from requests.exceptions import ChunkedEncodingError
23 | from slack_bolt import App
24 | from slack_bolt.adapter.socket_mode import SocketModeHandler
25 | from slack_sdk.models.views import View
26 | from slack_sdk.web import WebClient
27 |
28 | logging.basicConfig(stream=sys.stdout,
29 | level=int(os.getenv('LOG_LEVEL', logging.INFO)),
30 | format='%(asctime)s %(message)s',
31 | datefmt='%d-%m-%Y %H:%M:%S', )
32 | logger = logging.getLogger(__name__)
33 |
34 | DE_CHANNELS = ['C01FABYF2RG', 'C06CBSE16JC', 'C06BZJX8PSP']
35 | ML_CHANNELS = ['C0288NJ5XSA', 'C05C3SGMLBB', 'C05DTQECY66']
36 | MLOPS_CHANNELS = ['C02R98X7DS9', 'C06C1N46CQ1', 'C0735558X52']
37 | LLM_CHANNELS = ['C079QE5NAMP', 'C078X7REVN3', 'C06TEGTGM3J']
38 |
39 | ALLOWED_CHANNELS = DE_CHANNELS + ML_CHANNELS + MLOPS_CHANNELS + LLM_CHANNELS
40 |
41 | PROJECT_NAME = 'datatalks-faq-slackbot'
42 | ML_ZOOMCAMP_PROJECT_NAME = 'ml-zoomcamp-slack-bot'
43 | DE_ZOOMCAMP_PROJECT_NAME = 'de-zoomcamp-slack-bot'
44 |
45 | ML_COLLECTION_NAME = 'mlzoomcamp_faq_git'
46 | DE_COLLECTION_NAME = 'dezoomcamp_faq_git'
47 | MLOPS_COLLECTION_NAME = 'mlopszoomcamp'
48 | LLM_COLLECTION_NAME = 'llmzoomcamp'
49 |
50 | GPT_MODEL_NAME = 'gpt-4o-mini-2024-07-18'
51 |
52 | # Event API & Web API
53 | SLACK_BOT_TOKEN = os.getenv('SLACK_BOT_TOKEN')
54 | SLACK_APP_TOKEN = os.getenv('SLACK_APP_TOKEN')
55 | app = App(token=SLACK_BOT_TOKEN)
56 | langsmith_client = Client()
57 |
58 |
59 | @app.action('upvote')
60 | def add_positive_feedback(ack, body):
61 | ack()
62 | add_feedback(body, 'upvote')
63 |
64 |
65 | @app.action('downvote')
66 | def add_negative_feedback(ack, body):
67 | ack()
68 | add_feedback(body, 'downvote')
69 |
70 |
71 | def add_feedback(body, feedback_type: str):
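    | # Bump the vote counter shown on the clicked button, then record the vote in LangSmith.
    | # The feedback id is derived deterministically from (run_id, feedback_type), so repeated
    | # votes update the same feedback entry instead of creating duplicates.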
72 | run_id = None
73 | feedback_id = None
74 | try:
75 | original_blocks = body['message']['blocks']
76 | actions_block_elements = [block for block in original_blocks if block.get('type') == 'actions'][0]['elements']
77 | element_to_update = \
78 | [element for element in actions_block_elements if element.get('action_id') == feedback_type][0]
79 | element_text_to_update = element_to_update['text']['text']
80 | updated_text, updated_number = increment_number_in_string(element_text_to_update)
81 | element_to_update['text']['text'] = updated_text
82 |
83 | run_id = body['actions'][0]['value']
84 | feedback_id = get_feedback_id_from_run_id_and_feedback_type(run_id, feedback_type)
85 |
86 | user_id = body['user']['id']
87 | user_name = body['user']['username']
88 |
89 | logger.info(f'run_id {run_id} {feedback_type}d by {user_name}({user_id})')
90 |
91 | if updated_number > 1:
92 | langsmith_client.update_feedback(
93 | feedback_id=feedback_id,
94 | score=updated_number
95 | )
96 | else:
97 | langsmith_client.create_feedback(
98 | run_id=run_id,
99 | key=feedback_type,
100 | score=updated_number,
101 | feedback_id=feedback_id
102 | )
103 |
104 | client.chat_update(
105 | channel=body['channel']['id'],
106 | ts=body['message']['ts'],
107 | blocks=original_blocks,
108 | text=body['message']['text']
109 | )
110 | except Exception as ex:
111 | error_message = f'An error occurred when trying to record user feedback with action body =\n{body}\n'
112 | if run_id:
113 | error_message += f'for run_id = {run_id}\n'
114 | if feedback_id:
115 | error_message += f'and feedback_id = {feedback_id}\n'
116 |
117 | logger.error(f'{error_message}'
118 | f'Error: {ex}')
119 | show_feedback_logging_error_modal(body['trigger_id'])
120 |
121 |
122 | def show_feedback_logging_error_modal(trigger_id):
123 | client.views_open(trigger_id=trigger_id,
124 | view=View(type='modal',
125 | title='Error recording feedback',
126 | blocks=[
127 | {
128 | "type": "section",
129 | "text": {
130 | "type": "mrkdwn",
131 | "text": (
132 | "An error occurred while attempting to capture your feedback.\n"
133 | "Please try again later. Apologies for the inconvenience.")
134 | }
135 | }
136 | ]))
137 |
138 |
139 | def get_feedback_id_from_run_id_and_feedback_type(run_id, feedback_type):
140 | # Combine run_id UUID bytes and action bytes
141 | combined_bytes = uuid.UUID(run_id).bytes + feedback_type.encode('utf-8')
142 | # Hash the combined bytes
143 | hashed_bytes = hashlib.sha1(combined_bytes).digest()
144 | # Convert hashed bytes to UUID
145 | return uuid.UUID(bytes=hashed_bytes[:16])
146 |
147 |
148 | # This gets activated when the bot is tagged in a channel
149 | @app.event("app_mention")
150 | def handle_message_events(body):
151 | channel_id = body["event"]["channel"]
152 | event_ts = body["event"]["event_ts"]
153 | user = body["event"]["user"]
154 |
155 | if channel_id not in ALLOWED_CHANNELS:
156 | client.chat_postMessage(channel=channel_id,
157 | thread_ts=event_ts,
158 | text="Apologies, I can't answer questions in this channel")
159 | return
160 |
161 | # Extract question from the message text
162 | question = remove_mentions(str(body["event"]["text"]))
163 | if question.strip() == '':
164 | client.chat_postMessage(channel=channel_id,
165 | thread_ts=event_ts,
166 | text=('Ooops! It seems like your question is empty. '
167 | 'Please make sure to tag me in your message along with your question.')
168 | )
169 | return
170 | logger.info(question)
171 |
172 | # Let the user know that we are busy with the request
173 | greeting_message = get_greeting_message(channel_id)
174 |
175 | posted_greeting_message = client.chat_postMessage(channel=channel_id,
176 | thread_ts=event_ts,
177 | text=greeting_message,
178 | unfurl_links=False)
179 | try:
180 | with callbacks.collect_runs() as cb:
181 | if channel_id in MLOPS_CHANNELS:
182 | response = mlops_query_engine.query(question)
183 | elif channel_id in ML_CHANNELS:
184 | response = ml_query_engine.query(question)
185 | elif channel_id in LLM_CHANNELS:
186 | response = llm_query_engine.query(question)
187 | else:
188 | response = de_query_engine.query(question)
189 | # get the id of the last traced run, which should be the one that produced the final answer
190 | run_id = cb.traced_runs[-1].id
191 |
192 | response_text = f"Hey, <@{user}>! Here you go: \n{response}"
193 |
194 | response_blocks = [
195 | {
196 | "type": "section",
197 | "text": {
198 | "type": "mrkdwn",
199 | "text": response_text
200 | }
201 | },
202 | {
203 | "type": "divider"
204 | }]
205 | if hasattr(response, "source_nodes"):
206 | sources = links_to_source_nodes(response)
207 | references = f"References:\n{sources}"
208 | references_blocks = [{
209 | "type": "section",
210 | "text": {
211 | "type": "mrkdwn",
212 | "text": references
213 | }
214 | },
215 | {
216 | "type": "divider"
217 | }]
218 | response_blocks.extend(references_blocks)
219 |
220 | response_blocks.extend([{
221 | "type": "context",
222 | "elements": [
223 | {
224 | "type": "mrkdwn",
225 | "text": ":pray: Please leave your feedback to help me improve "
226 | }
227 | ]
228 | },
229 | {
230 | "type": "actions",
231 | "elements": [
232 | {
233 | "type": "button",
234 | "text": {
235 | "type": "plain_text",
236 | "text": ":thumbsup: 0"
237 | },
238 | "style": "primary",
239 | "value": f"{run_id}",
240 | "action_id": "upvote"
241 | },
242 | {
243 | "type": "button",
244 | "text": {
245 | "type": "plain_text",
246 | "text": ":thumbsdown: 0"
247 | },
248 | "style": "danger",
249 | "value": f"{run_id}",
250 | "action_id": "downvote"
251 | }
252 | ]
253 | }
254 | ])
255 |
256 | client.chat_postMessage(channel=channel_id,
257 | thread_ts=event_ts,
258 | blocks=response_blocks,
259 | text=response_text,
260 | unfurl_media=False
261 | )
262 | client.chat_delete(channel=channel_id,
263 | ts=posted_greeting_message.data['ts'])
264 | except CohereAPIError:
265 | client.chat_postMessage(channel=channel_id,
266 | thread_ts=event_ts,
267 | text="There was an error, please try again later")
268 | except Exception as e:
269 | logger.error(f'Error responding to a query\n{e}')
270 | client.chat_postMessage(channel=channel_id,
271 | thread_ts=event_ts,
272 | text=f"There was an error: {e}")
273 |
274 |
275 | def links_to_source_nodes(response):
276 | res = set()
277 | source_nodes = response.source_nodes
278 | link_template = 'https://datatalks-club.slack.com/archives/{}/p{}'
279 | for node in source_nodes:
280 | # Slack
281 | if 'channel' in node.metadata:
282 | channel_id = node.metadata['channel']
283 | thread_ts = node.metadata['thread_ts']
284 | thread_ts_str = str(thread_ts).replace('.', '')
285 | # build the Slack permalink to the source thread
286 | res.add(link_template.format(channel_id, thread_ts_str))
287 | # Google doc
288 | elif 'source' in node.metadata:
289 | title = node.metadata['title']
290 | if title == 'FAQ':
291 | section_title = node.text.split('\n', 1)[0]
292 | res.add(f"<{node.metadata['source']}|"
293 | f" {title}-{section_title}...> ")
294 | else:
295 | res.add(f"<{node.metadata['source']}| {title}>")
296 | # GitHub
297 | elif 'repo' in node.metadata:
298 | repo = node.metadata['repo']
299 | owner = node.metadata['owner']
300 | branch = node.metadata['branch']
301 | file_path = node.metadata['file_path']
302 | link_to_file = build_repo_path(owner=owner, repo=repo, branch=branch, file_path=file_path)
303 | res.add(f'<{link_to_file}| GitHub-{repo}-{file_path.split("/")[-1]}>')
304 | elif 'yt_link' in node.metadata:
305 | yt_link = node.metadata['yt_link']
306 | yt_title = node.metadata['yt_title']
307 | res.add(f'<{yt_link}| Youtube-{yt_title}>')
308 | return '\n'.join(res)
309 |
310 |
311 | def increment_number_in_string(source_string):
312 | # Regular expression to find any sequence of digits (\d+)
313 | pattern = r'(\d+)'
314 |
315 | # Define a lambda function to replace matched digits with the incremented value
316 | replacer = lambda match: str(int(match.group(0)) + 1)
317 |
318 | # Use re.sub() to replace matched digits with the incremented value
319 | result_string = re.sub(pattern, replacer, source_string)
320 | result_number = int(re.search(pattern, result_string).group(0))
321 |
322 | return result_string, result_number
323 |
324 |
325 | def build_repo_path(owner: str, repo: str, branch: str, file_path: str):
326 | return f'https://github.com/{owner}/{repo}/blob/{branch}/{file_path}'
327 |
328 |
329 | def remove_mentions(input_text):
330 | # Define a regular expression pattern to match the mention
331 | mention_pattern = r'<@U[0-9A-Z]+>'
332 |
333 | return re.sub(mention_pattern, '', input_text)
334 |
335 |
336 | def get_greeting_message(channel_id):
337 | message_template = "Hello from {name} FAQ Bot! :robot_face: \n" \
338 | "Please note that I'm under active development. " \
339 | "The answers might not be accurate since I'm " \
340 | "just a human-friendly interface to the " \
341 | "" \
342 | ", this Slack channel, and this course's ." \
343 | "\nThanks for your request, I'm on it!"
344 | if channel_id in MLOPS_CHANNELS:
345 | name = 'MLOps'
346 | link = '12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit#heading=h.uwpp1jrsj0d'
347 | repo = 'mlops-zoomcamp'
348 | elif channel_id in ML_CHANNELS:
349 | name = 'ML'
350 | link = '1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8/edit#heading=h.98qq6wfuzeck'
351 | repo = 'machine-learning-zoomcamp'
352 | elif channel_id in LLM_CHANNELS:
353 | name = 'LLM'
354 | link = '1m2KexowAXTmexfC5rVTCSnaShvdUQ8Ag2IEiwBDHxN0/edit#heading=h.o29af0z8xx88'
355 | repo = 'llm-zoomcamp'
356 | else:
357 | name = 'DE'
358 | link = '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit#heading=h.o29af0z8xx88'
359 | repo = 'data-engineering-zoomcamp'
360 | return message_template.format(name=name, link=link, repo=repo)
361 |
362 |
363 | def log_to_langsmith():
364 | os.environ["LANGCHAIN_TRACING_V2"] = "true"
365 | os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
366 | os.environ["LANGCHAIN_PROJECT"] = PROJECT_NAME
367 |
368 |
369 | def get_prompt_template(zoomcamp_name: str, cohort_year: int, course_start_date: str) -> ChatPromptTemplate:
370 | system_prompt = ChatMessage(
371 | content=(
372 | "You are a helpful AI assistant for the {zoomcamp_name} ZoomCamp course at DataTalksClub, "
373 | "and you can be found in the course's Slack channel.\n"
374 | "As a trustworthy assistant, you must provide helpful answers to students' questions about the course, "
375 | "and assist them in finding solutions when they encounter problems/errors while following the course. \n"
376 | "You must do it using only the excerpts from the course FAQ document, Slack threads, and GitHub repository "
377 | "that are provided to you, without relying on prior knowledge.\n"
378 | "Current cohort is year {cohort_year} one and the course start date is {course_start_date}. \n"
379 | "Today is {current_date}. Take this into account when answering questions with temporal aspect. \n"
380 | "Here are your guidelines:\n"
381 | "- Provide clear and concise explanations for your conclusions, including relevant evidences, and "
382 | "relevant code snippets if the question pertains to code. \n"
383 | "- Avoid starting your answer with 'Based on the provided ...' or 'The context information ...' "
384 | "or anything like this, instead, provide the information directly in the response.\n"
385 | "- Justify your response in detail by explaining why you made the conclusions you actually made.\n"
386 | "- In your response, refrain from rephrasing the user's question or problem; simply provide an answer.\n"
387 | "- Make sure that the code examples you provide are accurate and runnable.\n"
388 | "- If the question requests confirmation, avoid repeating the question. Instead, conduct your own "
389 | "analysis based on the provided sources.\n"
390 | "- In cases where the provided information is insufficient and you are uncertain about the response, "
391 | "reply with: 'I don't think I have an answer for this; you'll have to ask your fellows or instructors.\n"
392 | "- All the hyperlinks need to be taken from the provided excerpts, not from prior knowledge. "
393 | "If there are no hyperlinks provided, abstain from adding hyperlinks to the answer.\n"
394 | "- The hyperlinks need to be formatted the following way: \n"
395 | "Example of the correctly formatted link to github: \n"
396 | ""
397 | ),
398 | role=MessageRole.SYSTEM,
399 | )
400 | user_prompt = ChatMessage(content=("Excerpts from the course FAQ document, Slack threads, and "
401 | "GitHub repository are below delimited by the dashed lines:\n"
402 | "---------------------\n"
403 | "{context_str}\n"
404 | "---------------------\n"
405 | "Question: {query_str}\n"
406 | "Answer: "),
407 | role=MessageRole.USER, )
408 | return ChatPromptTemplate(message_templates=[
409 | system_prompt,
410 | user_prompt,
411 | ],
412 | function_mappings={'zoomcamp_name': lambda **kwargs: zoomcamp_name,
413 | 'cohort_year': lambda **kwargs: cohort_year,
414 | 'current_date': lambda **kwargs: datetime.datetime.now().strftime("%d %B %Y"),
415 | 'course_start_date': lambda **kwargs: course_start_date})
416 |
417 |
418 | def get_retriever_query_engine(collection_name: str,
419 | zoomcamp_name: str,
420 | cohort_year: int,
421 | course_start_date: str):
422 | if os.getenv('LOCAL_MILVUS', None):
423 | localhost = os.getenv('LOCALHOST', 'localhost')
424 | vector_store = MilvusVectorStore(collection_name=collection_name,
425 | dim=embedding_dimension,
426 | overwrite=False,
427 | uri=f'http://{localhost}:19530')
428 | else:
429 | if collection_name in [MLOPS_COLLECTION_NAME, LLM_COLLECTION_NAME]:
430 | vector_store = MilvusVectorStore(collection_name=collection_name,
431 | uri=os.getenv("ZILLIZ_PUBLIC_ENDPOINT"),
432 | token=os.getenv("ZILLIZ_API_KEY"),
433 | dim=embedding_dimension,
434 | overwrite=False)
435 | else:
436 | vector_store = MilvusVectorStore(collection_name=collection_name,
437 | uri=os.getenv("ZILLIZ_CLOUD_URI"),
438 | token=os.getenv("ZILLIZ_CLOUD_API_KEY"),
439 | dim=embedding_dimension,
440 | overwrite=False)
441 | vector_store_index = VectorStoreIndex.from_vector_store(vector_store,
442 | embed_model=embeddings)
443 | # cohere_rerank = CohereRerank(api_key=os.getenv('COHERE_API_KEY'), top_n=4)
444 | recency_postprocessor = get_time_weighted_postprocessor()
445 | # node_postprocessors = [recency_postprocessor, cohere_rerank]
446 | node_postprocessors = [recency_postprocessor]
447 | qa_prompt_template = get_prompt_template(zoomcamp_name=zoomcamp_name,
448 | cohort_year=cohort_year,
449 | course_start_date=course_start_date)
450 | Settings.llm = ChatOpenAI(model=GPT_MODEL_NAME,
451 | temperature=0.7)
452 |
453 | response_synthesizer = get_response_synthesizer(text_qa_template=qa_prompt_template,
454 | verbose=True,
455 | )
456 | return RetrieverQueryEngine(vector_store_index.as_retriever(similarity_top_k=15),
457 | node_postprocessors=node_postprocessors,
458 | response_synthesizer=response_synthesizer,
459 | )
460 |
461 |
462 | def get_time_weighted_postprocessor():
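    | # Favor more recent Slack threads (via their thread_ts metadata) when ranking retrieved nodes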
463 | return TimeWeightedPostprocessor(
464 | last_accessed_key='thread_ts',
465 | time_decay=0.4,
466 | time_access_refresh=False,
467 | top_k=10,
468 | )
469 |
470 |
471 | if __name__ == "__main__":
472 | client = WebClient(SLACK_BOT_TOKEN)
473 |
474 | logger.info('Downloading embeddings...')
475 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
476 | while True:
477 | try:
478 | embeddings = HuggingFaceEmbedding(model_name='BAAI/bge-base-en-v1.5')
479 | embedding_dimension = len(embeddings.get_text_embedding("test"))
480 | except ChunkedEncodingError as e:
481 | continue
482 | break
483 |
484 | log_to_langsmith()
485 |
486 | ml_query_engine = get_retriever_query_engine(collection_name=ML_COLLECTION_NAME,
487 | zoomcamp_name='Machine Learning',
488 | cohort_year=2024,
489 | course_start_date='16 September 2024')
490 |
491 | de_query_engine = get_retriever_query_engine(collection_name=DE_COLLECTION_NAME,
492 | zoomcamp_name='Data Engineering',
493 | cohort_year=2025,
494 | course_start_date='13 January 2025')
495 |
496 | mlops_query_engine = get_retriever_query_engine(collection_name=MLOPS_COLLECTION_NAME,
497 | zoomcamp_name='MLOps',
498 | cohort_year=2024,
499 | course_start_date='13 May 2024')
500 |
501 | llm_query_engine = get_retriever_query_engine(collection_name=LLM_COLLECTION_NAME,
502 | zoomcamp_name='LLM',
503 | cohort_year=2024,
504 | course_start_date='17 June 2024')
505 | SocketModeHandler(app, SLACK_APP_TOKEN).start()
506 |
--------------------------------------------------------------------------------
/slack_bot/requirements.txt:
--------------------------------------------------------------------------------
1 | openai==1.61.0
2 | slack-bolt==1.22.0
3 | slack-sdk==3.34.0
4 | langchain==0.3.17
5 | langchain-community==0.3.16
6 | sentence-transformers==3.4.1
7 | cohere==5.15.0
8 | pymilvus==2.5.4
9 | langchain-openai==0.3.3
10 | llama-index-core==0.12.15
11 | llama-index-vector-stores-milvus==0.5.0
12 | llama-index-embeddings-huggingface==0.5.1
13 | llama-index-postprocessor-cohere-rerank==0.4.0
14 | llama-index-llms-langchain==0.5.1
15 |
--------------------------------------------------------------------------------
/slack_bot_custom_ingestion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaalexlit/faq-slack-bot/78962d17dbd68438bd443b7dfdb961ac9d13e574/slack_bot_custom_ingestion.png
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import langchain
4 | import pinecone
5 | from langchain.chains import RetrievalQA
6 | from langchain.embeddings import HuggingFaceEmbeddings
7 | from langchain.vectorstores import Pinecone
8 | from langchain.chat_models import ChatOpenAI
9 |
10 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
11 | embeddings = HuggingFaceEmbeddings()
12 | langchain.debug = True
13 |
14 |
15 | def main(question):
16 | pinecone.init(
17 | api_key=os.getenv('PINECONE_API_KEY'),
18 | environment=os.getenv('PINECONE_ENV')
19 | )
20 |
21 | pinecone_index = Pinecone.from_existing_index(index_name='mlops-faq-bot',
22 | embedding=embeddings)
23 | qa = RetrievalQA.from_chain_type(
24 | llm=ChatOpenAI(model_name='gpt-3.5-turbo-1106'),
25 | retriever=pinecone_index.as_retriever()
26 | )
27 | qa.return_source_documents = True
28 | print(f"Question: {question}")
29 |
30 | result = qa.apply([question])
31 | for res in result:
32 | print(res.keys())
33 | print(f"Question: {res['query']}")
34 | print(f"Answer: {res['result']}")
35 | for doc in res['source_documents']:
36 | print("----------------------------------------------------")
37 | print(f"Metadata: {doc.metadata}")
38 | print(f"Content: {doc.page_content}")
39 |
40 |
41 | if __name__ == "__main__":
42 | # main("How can I solve connection in use problem with mlflow?")
43 | main("MLflow UI throws an error on the browser 'Access to localhost was denied'. Any idea how to resolve this?")
44 |
--------------------------------------------------------------------------------