├── .github └── workflows │ ├── docker-image.yml │ ├── fly-deploy.yml │ └── fly-rollback.yml ├── .gitignore ├── .prefectignore ├── .run ├── create_secrets_blocks.run.xml ├── ingest_de.run.xml ├── ingest_ml.run.xml ├── ingest_mlops.run.xml ├── run_bot_datatalks.run.xml └── run_bot_local_ws.run.xml ├── Mlops_chatbot_diagram.png ├── README.md ├── dev.env ├── fly.toml ├── ingest ├── README.md ├── de.dockerfile ├── de │ └── ingest_de.py ├── dev.env ├── llm.dockerfile ├── llm │ └── ingest_llm.py ├── local_development.md ├── local_milvus │ └── docker-compose.yml ├── ml.dockerfile ├── ml │ └── ingest_ml.py ├── mlops.dockerfile ├── mlops │ ├── ingest_mlops.py │ └── ingest_mlops_old.py ├── prefect.md ├── prefect_infra │ └── create_secrets_blocks.py ├── readers │ ├── custom_faq_gdoc_reader.py │ ├── slack_reader.py │ └── youtube_reader.py ├── requirements.txt └── utils │ └── index_utils.py ├── prefect.yaml ├── requirements.txt ├── slack_bot ├── Dockerfile ├── README.md ├── app_manifest.json ├── bot_icon.png ├── dev.env ├── docker-compose-my-workspace.yml ├── docker-compose.yml ├── main.py └── requirements.txt ├── slack_bot_custom_ingestion.png └── test.py /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build-and-push-image: 8 | name: Push Docker image to Docker Hub 9 | runs-on: ubuntu-latest 10 | permissions: 11 | contents: read 12 | packages: write 13 | 14 | steps: 15 | - name: Check out the repo 16 | uses: actions/checkout@v4 17 | 18 | - name: Log in to Docker Hub 19 | uses: docker/login-action@v3 20 | with: 21 | username: ${{ secrets.DOCKER_USERNAME }} 22 | password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }} 23 | 24 | - name: Pull currently used Docker image 25 | run: docker pull aaalexlit/faq-slack-bot:main 26 | 27 | - name: Tag currently used Docker image as "previous" to enable easy rollback 28 | run: docker tag aaalexlit/faq-slack-bot:main aaalexlit/faq-slack-bot:previous 29 | 30 | - name: Push tagged image to Docker Hub 31 | run: docker push aaalexlit/faq-slack-bot:previous 32 | 33 | - name: Extract metadata (tags, labels) for Docker 34 | id: meta 35 | uses: docker/metadata-action@v5 36 | with: 37 | images: aaalexlit/faq-slack-bot 38 | tags: | 39 | type=sha 40 | type=ref,event=branch 41 | 42 | - name: Build and push Docker image 43 | uses: docker/build-push-action@v5 44 | with: 45 | context: ./slack_bot/ 46 | file: ./slack_bot/Dockerfile 47 | push: true 48 | tags: ${{ steps.meta.outputs.tags }} 49 | labels: ${{ steps.meta.outputs.labels }} 50 | -------------------------------------------------------------------------------- /.github/workflows/fly-deploy.yml: -------------------------------------------------------------------------------- 1 | name: Fly Deploy 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Docker Image CI"] 6 | types: 7 | - completed 8 | workflow_dispatch: 9 | inputs: 10 | docker_tag: 11 | description: 'Docker image tag to be deployed. 
by default `main`' 12 | required: false 13 | default: 'main' 14 | 15 | jobs: 16 | deploy: 17 | name: Deploy app 18 | runs-on: ubuntu-latest 19 | if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }} 20 | steps: 21 | - uses: actions/checkout@v4 22 | - uses: superfly/flyctl-actions/setup-flyctl@master 23 | - run: | 24 | if [ "${{ github.event_name }}" == "workflow_run" ]; then 25 | DOCKER_TAG="main" 26 | else 27 | DOCKER_TAG="${{ github.event.workflow_run.event.inputs.docker_tag || github.event.inputs.docker_tag }}" 28 | fi 29 | flyctl deploy --remote-only --image aaalexlit/faq-slack-bot:${DOCKER_TAG} 30 | env: 31 | FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} 32 | -------------------------------------------------------------------------------- /.github/workflows/fly-rollback.yml: -------------------------------------------------------------------------------- 1 | name: Fly Rollback 2 | on: 3 | workflow_dispatch: 4 | jobs: 5 | rollback: 6 | name: Rollback to the previously deployed image 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - uses: superfly/flyctl-actions/setup-flyctl@master 11 | - run: flyctl deploy --remote-only --image aaalexlit/faq-slack-bot:previous 12 | env: 13 | FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### PyCharm+all template 2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 4 | 5 | # User-specific stuff 6 | .idea/**/workspace.xml 7 | .idea/**/tasks.xml 8 | .idea/**/usage.statistics.xml 9 | .idea/**/dictionaries 10 | .idea/**/shelf 11 | 12 | # AWS User-specific 13 | .idea/**/aws.xml 14 | 15 | # Generated files 16 | .idea/**/contentModel.xml 17 | 18 | # Sensitive or high-churn files 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.local.xml 22 | .idea/**/sqlDataSources.xml 23 | .idea/**/dynamic.xml 24 | .idea/**/uiDesigner.xml 25 | .idea/**/dbnavigator.xml 26 | 27 | # Gradle 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # Gradle and Maven with auto-import 32 | # When using Gradle or Maven with auto-import, you should exclude module files, 33 | # since they will be recreated, and may cause churn. Uncomment if using 34 | # auto-import. 
35 | # .idea/artifacts 36 | # .idea/compiler.xml 37 | # .idea/jarRepositories.xml 38 | # .idea/modules.xml 39 | # .idea/*.iml 40 | # .idea/modules 41 | # *.iml 42 | # *.ipr 43 | 44 | # CMake 45 | cmake-build-*/ 46 | 47 | # Mongo Explorer plugin 48 | .idea/**/mongoSettings.xml 49 | 50 | # File-based project format 51 | *.iws 52 | 53 | # IntelliJ 54 | out/ 55 | 56 | # mpeltonen/sbt-idea plugin 57 | .idea_modules/ 58 | 59 | # JIRA plugin 60 | atlassian-ide-plugin.xml 61 | 62 | # Cursive Clojure plugin 63 | .idea/replstate.xml 64 | 65 | # SonarLint plugin 66 | .idea/sonarlint/ 67 | 68 | # Crashlytics plugin (for Android Studio and IntelliJ) 69 | com_crashlytics_export_strings.xml 70 | crashlytics.properties 71 | crashlytics-build.properties 72 | fabric.properties 73 | 74 | # Editor-based Rest Client 75 | .idea/httpRequests 76 | 77 | # Android studio 3.1+ serialized cache file 78 | .idea/caches/build_file_checksums.ser 79 | 80 | ### Python template 81 | # Byte-compiled / optimized / DLL files 82 | __pycache__/ 83 | *.py[cod] 84 | *$py.class 85 | 86 | # C extensions 87 | *.so 88 | 89 | # Distribution / packaging 90 | .Python 91 | build/ 92 | develop-eggs/ 93 | dist/ 94 | downloads/ 95 | eggs/ 96 | .eggs/ 97 | lib/ 98 | lib64/ 99 | parts/ 100 | sdist/ 101 | var/ 102 | wheels/ 103 | share/python-wheels/ 104 | *.egg-info/ 105 | .installed.cfg 106 | *.egg 107 | MANIFEST 108 | 109 | # PyInstaller 110 | # Usually these files are written by a python script from a template 111 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 112 | *.manifest 113 | *.spec 114 | 115 | # Installer logs 116 | pip-log.txt 117 | pip-delete-this-directory.txt 118 | 119 | # Unit test / coverage reports 120 | htmlcov/ 121 | .tox/ 122 | .nox/ 123 | .coverage 124 | .coverage.* 125 | .cache 126 | nosetests.xml 127 | coverage.xml 128 | *.cover 129 | *.py,cover 130 | .hypothesis/ 131 | .pytest_cache/ 132 | cover/ 133 | 134 | # Translations 135 | *.mo 136 | *.pot 137 | 138 | # Django stuff: 139 | *.log 140 | local_settings.py 141 | db.sqlite3 142 | db.sqlite3-journal 143 | 144 | # Flask stuff: 145 | instance/ 146 | .webassets-cache 147 | 148 | # Scrapy stuff: 149 | .scrapy 150 | 151 | # Sphinx documentation 152 | docs/_build/ 153 | 154 | # PyBuilder 155 | .pybuilder/ 156 | target/ 157 | 158 | # Jupyter Notebook 159 | .ipynb_checkpoints 160 | 161 | # IPython 162 | profile_default/ 163 | ipython_config.py 164 | 165 | # pyenv 166 | # For a library or package, you might want to ignore these files since the code is 167 | # intended to run in multiple environments; otherwise, check them in: 168 | # .python-version 169 | 170 | # pipenv 171 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 172 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 173 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 174 | # install all needed dependencies. 175 | #Pipfile.lock 176 | 177 | # poetry 178 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 179 | # This is especially recommended for binary packages to ensure reproducibility, and is more 180 | # commonly ignored for libraries. 181 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 182 | #poetry.lock 183 | 184 | # pdm 185 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
186 | #pdm.lock 187 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 188 | # in version control. 189 | # https://pdm.fming.dev/#use-with-ide 190 | .pdm.toml 191 | 192 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 193 | __pypackages__/ 194 | 195 | # Celery stuff 196 | celerybeat-schedule 197 | celerybeat.pid 198 | 199 | # SageMath parsed files 200 | *.sage.py 201 | 202 | # Environments 203 | .env 204 | .venv 205 | env/ 206 | venv/ 207 | ENV/ 208 | env.bak/ 209 | venv.bak/ 210 | 211 | # Spyder project settings 212 | .spyderproject 213 | .spyproject 214 | 215 | # Rope project settings 216 | .ropeproject 217 | 218 | # mkdocs documentation 219 | /site 220 | 221 | # mypy 222 | .mypy_cache/ 223 | .dmypy.json 224 | dmypy.json 225 | 226 | # Pyre type checker 227 | .pyre/ 228 | 229 | # pytype static type analyzer 230 | .pytype/ 231 | 232 | # Cython debug symbols 233 | cython_debug/ 234 | 235 | # PyCharm 236 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 237 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 238 | # and can be added to the global gitignore or merged into this file. For a more nuclear 239 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 240 | #.idea/ 241 | 242 | ingest/keys 243 | wandb/ 244 | .idea 245 | 246 | # ignore local folders where the indexed repo gets cloned to 247 | **/git 248 | 249 | # ignore local milvus volumes 250 | **/volumes 251 | 252 | /ingest/ml/build_docker_image.md 253 | 254 | *.ipynb -------------------------------------------------------------------------------- /.prefectignore: -------------------------------------------------------------------------------- 1 | # prefect artifacts 2 | .prefectignore 3 | 4 | # python artifacts 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | *.egg-info/ 9 | *.egg 10 | 11 | # Type checking artifacts 12 | .mypy_cache/ 13 | .dmypy.json 14 | dmypy.json 15 | .pyre/ 16 | 17 | # IPython 18 | profile_default/ 19 | ipython_config.py 20 | *.ipynb_checkpoints/* 21 | 22 | # Environments 23 | .python-version 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | 29 | # MacOS 30 | .DS_Store 31 | 32 | # Dask 33 | dask-worker-space/ 34 | 35 | # Editors 36 | .idea/ 37 | .vscode/ 38 | 39 | # VCS 40 | .git/ 41 | .hg/ 42 | -------------------------------------------------------------------------------- /.run/create_secrets_blocks.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 25 | -------------------------------------------------------------------------------- /.run/ingest_de.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 27 | -------------------------------------------------------------------------------- /.run/ingest_ml.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 27 | -------------------------------------------------------------------------------- /.run/ingest_mlops.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 27 | -------------------------------------------------------------------------------- /.run/run_bot_datatalks.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 26 | 
-------------------------------------------------------------------------------- /.run/run_bot_local_ws.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 26 | -------------------------------------------------------------------------------- /Mlops_chatbot_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaalexlit/faq-slack-bot/78962d17dbd68438bd443b7dfdb961ac9d13e574/Mlops_chatbot_diagram.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is a repo for a human-interface LLM-powered QA Slack chatbot for answering questions 2 | related to DataTalks.Club Zoomcamps 3 | 4 | # Current version 5 | 6 | Please follow [this report](https://api.wandb.ai/links/aaalex-lit/ii6tpid4) that 7 | explains in detail how the bot works 8 | 9 | # First (outdated) version 10 | ## Behind the scenes 11 | Course FAQ Google Document and the Course repo get indexed to the Pinecone vector store. 12 | Then semantic search retrieves the most similar (and hopefully most relevant) pieces to the question asked. 13 | Then this information is passed as a context to a conversational LLM to form the final answer. 14 | 15 | ![Diagram](Mlops_chatbot_diagram.png) 16 | 17 | # Before you start 18 | 19 | Use any python virtual environment manager of your preference 20 | and install the dependencies from [requirements.txt](requirements.txt) 21 | 22 | # Fill [Pinecone](https://www.pinecone.io/) index 23 | 1. Set `PINECONE_API_KEY` and `PINECONE_ENV` 24 | environmental variables accordingly 25 | 26 | 2. run [ingest/ingest.py](ingest/mlops/ingest_mlops_old.py) 27 | 28 | ```bash 29 | python ingest_mlops_old.py 30 | ``` 31 | # Test QA pipeline locally 32 | 1. Set `OPENAI_API_KEY`, `PINECONE_API_KEY`, and `PINECONE_ENV` 33 | environmental variables accordingly 34 | 1. Run [test.py](test.py) 35 | 36 | ```bash 37 | python test.py 38 | ``` 39 | # Launch the bot 40 | 1. Set `OPENAI_API_KEY`, `SLACK_APP_TOKEN`, `SLACK_BOT_TOKEN`, 41 | `PINECONE_API_KEY`, and `PINECONE_ENV` 42 | environmental variables accordingly 43 | 1. To launch the bot on the CLI run [slack_bot/main.py](slack_bot/main.py) 44 | ```bash 45 | python main.py 46 | ``` 47 | 48 | Alternatively it can be launched with Docker, please follow 49 | [this README](slack_bot/README.md) -------------------------------------------------------------------------------- /dev.env: -------------------------------------------------------------------------------- 1 | # test workspace slack token 2 | SLACK_APP_TOKEN=xapp-.. 3 | SLACK_BOT_TOKEN=xoxb-.. 4 | 5 | # OpenAI API key 6 | OPENAI_API_KEY=sk-.. 7 | 8 | PINECONE_API_KEY=.. 9 | PINECONE_ENV=.. 10 | 11 | WANDB_API_KEY=.. 12 | 13 | ZILLIZ_CLOUD_URI=https://.. 14 | ZILLIZ_CLOUD_API_KEY=.. 15 | 16 | ZILLIZ_PUBLIC_ENDPOINT=https://.. 17 | ZILLIZ_API_KEY=.. 18 | 19 | LANGCHAIN_API_KEY=lsv2_.. 20 | 21 | COHERE_API_KEY=.. 
22 | 23 | # DEBUG log level 24 | #LOG_LEVEL=10 25 | 26 | LOCAL_MILVUS=True -------------------------------------------------------------------------------- /fly.toml: -------------------------------------------------------------------------------- 1 | app = "faq-slack-bot" 2 | primary_region = "mad" 3 | 4 | [build] 5 | image = "aaalexlit/faq-slack-bot:main" 6 | 7 | [env] 8 | PINECONE_ENV = "gcp-starter" 9 | -------------------------------------------------------------------------------- /ingest/README.md: -------------------------------------------------------------------------------- 1 | # Execute indexing 2 | ## For ML Zoomcamp 3 | At the moment the indexing is scheduled to execute with [Prefect Cloud](https://app.prefect.cloud/) 4 | via deployments every 24 hours at 23 CET 5 | 6 | Steps to change/run the deployment are described in [prefect.md](prefect.md) 7 | 8 | ## For MLOps Zoomcamp 9 | 10 | Execute [ingest.py](mlops/ingest_mlops_old.py) 11 | ```shell 12 | python ingest_mlops_old.py 13 | ``` 14 | 15 | # Setup Prefect 16 | 17 | To run any ingestion, Prefect needs to be set up, 18 | as the code relies on secrets stored in Prefect blocks. 19 | 20 | ## Create a new profile to use with the cloud and use it (Optional) 21 | 22 | ```bash 23 | prefect profile create cloud 24 | prefect profile use cloud 25 | ``` 26 | 27 | ## Log in to prefect cloud either though browser or using the API key 28 | ```bash 29 | prefect cloud login 30 | ``` 31 | 32 | Create the required prefect blocks. Make sure to set up corresponding environment 33 | variables. 34 | 35 | ```shell 36 | python ingest/prefect_infra/create_secrets_blocks.py 37 | ``` -------------------------------------------------------------------------------- /ingest/de.dockerfile: -------------------------------------------------------------------------------- 1 | FROM prefecthq/prefect:2-python3.10 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | RUN pip install -U pip 7 | 8 | WORKDIR /usr/src 9 | 10 | COPY ingest/requirements.txt ./ 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | ENV EMBEDDING_CACHE_NAMESPACE=de_zoomcamp 14 | 15 | COPY ingest/de/ingest_de.py ingest/de/ 16 | COPY ingest/readers ingest/readers 17 | COPY ingest/utils ingest/utils 18 | -------------------------------------------------------------------------------- /ingest/de/ingest_de.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from prefect import flow, task 4 | 5 | from ingest.utils.index_utils import index_spreadsheet, index_github_repo, \ 6 | index_slack_history, index_faq, index_youtube 7 | 8 | DE_CHANNEL_ID = 'C01FABYF2RG' 9 | FAQ_COLLECTION_NAME = 'dezoomcamp_faq_git' 10 | 11 | os.environ['PREFECT_LOGGING_EXTRA_LOGGERS'] = 'llama-index-core' 12 | 13 | 14 | @task(name="Index course github repo") 15 | def index_course_github_repo(): 16 | owner = 'DataTalksClub' 17 | repo = 'data-engineering-zoomcamp' 18 | branch = 'main' 19 | index_github_repo(owner=owner, 20 | repo=repo, 21 | branch=branch, 22 | collection_name=FAQ_COLLECTION_NAME, 23 | ignore_directories=['.github', '.gitignore', 'cohorts/2022', 'cohorts/2023', 'cohorts/2024', 24 | 'images'], 25 | ) 26 | 27 | 28 | @task(name="Index risingwave zoomcamp github repo") 29 | def index_risingwave_zoomcamp_github_repo(): 30 | owner = 'risingwavelabs' 31 | repo = 'risingwave-data-talks-workshop-2024-03-04' 32 | branch = 'main' 33 | index_github_repo(owner=owner, 34 | repo=repo, 35 | branch=branch, 36 | 
collection_name=FAQ_COLLECTION_NAME, 37 | ignore_directories=['assets', 'data'], 38 | ignore_file_extensions=['.gitignore', '.parquet', '.csv']) 39 | 40 | 41 | @task(name="Index mage zoomcamp github repo") 42 | def index_mage_zoomcamp_github_repo(): 43 | owner = 'mage-ai' 44 | repo = 'mage-zoomcamp' 45 | branch = 'solutions' 46 | index_github_repo(owner=owner, 47 | repo=repo, 48 | branch=branch, 49 | collection_name=FAQ_COLLECTION_NAME, 50 | ignore_directories=[], 51 | ignore_file_extensions=['.gitignore']) 52 | 53 | 54 | @task(name="Index FAQ Google Document") 55 | def index_google_doc(): 56 | document_ids = ["19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw"] 57 | print('Loading google doc...') 58 | index_faq(document_ids, FAQ_COLLECTION_NAME) 59 | 60 | 61 | @task(name="Index course schedule") 62 | def index_course_schedule(): 63 | url = ( 64 | 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-' 65 | 'yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml') 66 | title = 'DE Zoomcamp 2024 syllabus and deadlines' 67 | index_spreadsheet(url, title, FAQ_COLLECTION_NAME) 68 | 69 | 70 | @task(name="Index slack messages") 71 | def index_slack_messages(): 72 | channel_ids = [DE_CHANNEL_ID] 73 | index_slack_history(channel_ids, FAQ_COLLECTION_NAME) 74 | 75 | 76 | @task(name="Index QA videos subtitles") 77 | def index_yt_subtitles(): 78 | video_ids = ['X8cEEwi8DTM'] 79 | index_youtube(video_ids, FAQ_COLLECTION_NAME) 80 | 81 | 82 | @flow(name="Update DE info Milvus index", log_prints=True) 83 | def fill_de_index(): 84 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}") 85 | index_google_doc() 86 | index_slack_messages.submit(wait_for=[index_google_doc]) 87 | index_course_schedule.submit(wait_for=[index_google_doc]) 88 | # index_evaluation_criteria.submit(wait_for=[index_google_doc]) 89 | index_course_github_repo.submit(wait_for=[index_google_doc]) 90 | index_yt_subtitles.submit(wait_for=[index_google_doc]) 91 | 92 | 93 | if __name__ == '__main__': 94 | fill_de_index() 95 | -------------------------------------------------------------------------------- /ingest/dev.env: -------------------------------------------------------------------------------- 1 | PINECONE_API_KEY=.. 2 | PINECONE_ENV=.. 3 | ZILLIZ_CLOUD_URI=https://.. 4 | ZILLIZ_CLOUD_API_KEY=.. 5 | SLACK_BOT_TOKEN=xoxb-.. 6 | GITHUB_TOKEN=ghp_.. 7 | UPSTASH_REDIS_REST_URL=https://.. 8 | UPSTASH_REDIS_REST_TOKEN=.. 9 | ZILLIZ_PUBLIC_ENDPOINT=https://.. 10 | ZILLIZ_API_KEY=.. 
11 | -------------------------------------------------------------------------------- /ingest/llm.dockerfile: -------------------------------------------------------------------------------- 1 | FROM prefecthq/prefect:2-python3.10 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | RUN pip install -U pip 7 | 8 | WORKDIR /usr/src 9 | 10 | COPY ingest/requirements.txt ./ 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | ENV EMBEDDING_CACHE_NAMESPACE=llm_zoomcamp 14 | 15 | COPY ingest/llm/ingest_llm.py ingest/llm/ 16 | COPY ingest/readers ingest/readers 17 | COPY ingest/utils ingest/utils 18 | -------------------------------------------------------------------------------- /ingest/llm/ingest_llm.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from prefect import flow, task 4 | 5 | from ingest.utils.index_utils import index_github_repo, \ 6 | index_slack_history, index_faq 7 | 8 | SLACK_CHANNEL_ID = 'C06TEGTGM3J' 9 | COLLECTION_NAME = 'llmzoomcamp' 10 | 11 | 12 | @task(name="Index course github repo") 13 | def index_course_github_repo(): 14 | owner = 'DataTalksClub' 15 | repo = 'llm-zoomcamp' 16 | branch = 'main' 17 | index_github_repo(owner=owner, 18 | repo=repo, 19 | branch=branch, 20 | collection_name=COLLECTION_NAME, 21 | ignore_directories=['.github', '.gitignore', 'images'], 22 | ) 23 | 24 | 25 | @task(name="Index FAQ Google Document") 26 | def index_google_doc(): 27 | document_ids = ["1m2KexowAXTmexfC5rVTCSnaShvdUQ8Ag2IEiwBDHxN0"] 28 | print('Loading google doc...') 29 | index_faq(document_ids, COLLECTION_NAME) 30 | 31 | 32 | @task(name="Index slack messages") 33 | def index_slack_messages(): 34 | channel_ids = [SLACK_CHANNEL_ID] 35 | index_slack_history(channel_ids, COLLECTION_NAME) 36 | 37 | 38 | @flow(name="Update LLM info Milvus index", log_prints=True) 39 | def fill_llm_index(): 40 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}") 41 | index_google_doc() 42 | index_slack_messages.submit(wait_for=[index_google_doc]) 43 | index_course_github_repo.submit(wait_for=[index_google_doc]) 44 | 45 | 46 | if __name__ == '__main__': 47 | fill_llm_index() 48 | -------------------------------------------------------------------------------- /ingest/local_development.md: -------------------------------------------------------------------------------- 1 | # Run ingestion locally for ML and DE Zoomcamps 2 | 3 | Steps to fill in the index locally: 4 | 5 | 1. start dockerized [Milvus](https://milvus.io/) from [local_milvus](local_milvus) folder 6 | ```shell 7 | cd ingest/local_milvus 8 | docker compose up 9 | ``` 10 | 11 | 1. Rename [dev.env](../dev.env) to `.env` and set all the required variables 12 | 13 | 1. Create the prefect blocks (needs to be run once) 14 | ```shell 15 | python ingest/prefect_infra/create_secrets_blocks.py 16 | ``` 17 | 18 | 1. execute ingestion script [ingest_ml.py](ml/ingest_ml.py) (for ML zoomcamp data) 19 | or [ingest_de.py](de/ingest_de.py) (for DE zoomcamp data). 
20 | It will be executed with `EXECUTION_ENV` env var set to `local` by default 21 | ```shell 22 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 23 | python ingest/ml/ingest_ml.py 24 | ``` 25 | 26 | If you're using Pycharm IDE there are run configurations available: 27 | [ingest_de](../.run/ingest_de.run.xml) 28 | [ingest_ml](../.run/ingest_ml.run.xml) -------------------------------------------------------------------------------- /ingest/local_milvus/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.5' 2 | 3 | services: 4 | etcd: 5 | container_name: milvus-etcd 6 | image: quay.io/coreos/etcd:v3.5.16 7 | environment: 8 | - ETCD_AUTO_COMPACTION_MODE=revision 9 | - ETCD_AUTO_COMPACTION_RETENTION=1000 10 | - ETCD_QUOTA_BACKEND_BYTES=4294967296 11 | - ETCD_SNAPSHOT_COUNT=50000 12 | volumes: 13 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd 14 | command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd 15 | healthcheck: 16 | test: ["CMD", "etcdctl", "endpoint", "health"] 17 | interval: 30s 18 | timeout: 20s 19 | retries: 3 20 | 21 | minio: 22 | container_name: milvus-minio 23 | image: minio/minio:RELEASE.2023-03-20T20-16-18Z 24 | environment: 25 | MINIO_ACCESS_KEY: minioadmin 26 | MINIO_SECRET_KEY: minioadmin 27 | ports: 28 | - "9001:9001" 29 | - "9000:9000" 30 | volumes: 31 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data 32 | command: minio server /minio_data --console-address ":9001" 33 | healthcheck: 34 | test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] 35 | interval: 30s 36 | timeout: 20s 37 | retries: 3 38 | 39 | standalone: 40 | container_name: milvus-standalone 41 | image: milvusdb/milvus:v2.5.4 42 | command: ["milvus", "run", "standalone"] 43 | security_opt: 44 | - seccomp:unconfined 45 | environment: 46 | ETCD_ENDPOINTS: etcd:2379 47 | MINIO_ADDRESS: minio:9000 48 | volumes: 49 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus 50 | healthcheck: 51 | test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] 52 | interval: 30s 53 | start_period: 90s 54 | timeout: 20s 55 | retries: 3 56 | ports: 57 | - "19530:19530" 58 | - "9091:9091" 59 | depends_on: 60 | - "etcd" 61 | - "minio" 62 | 63 | networks: 64 | default: 65 | name: milvus -------------------------------------------------------------------------------- /ingest/ml.dockerfile: -------------------------------------------------------------------------------- 1 | FROM prefecthq/prefect:2-python3.10 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | RUN pip install -U pip 7 | 8 | WORKDIR /usr/src 9 | 10 | COPY ingest/requirements.txt ./ 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | ENV EMBEDDING_CACHE_NAMESPACE=ml_zoomcamp 14 | 15 | COPY ingest/ml/ingest_ml.py ingest/ml/ 16 | COPY ingest/readers ingest/readers 17 | COPY ingest/utils ingest/utils 18 | -------------------------------------------------------------------------------- /ingest/ml/ingest_ml.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from prefect import flow, task 4 | 5 | from ingest.utils.index_utils import index_spreadsheet, index_github_repo, \ 6 | index_slack_history, index_faq 7 | 8 | ML_CHANNEL_ID = 'C0288NJ5XSA' 9 | FAQ_COLLECTION_NAME = 'mlzoomcamp_faq_git' 10 | 11 | 12 | @task(name="Index course github repo") 13 | def index_course_github_repo(): 14 | owner = 'DataTalksClub' 15 | repo 
= 'machine-learning-zoomcamp' 16 | branch = 'master' 17 | index_github_repo(owner=owner, 18 | repo=repo, 19 | branch=branch, 20 | collection_name=FAQ_COLLECTION_NAME) 21 | 22 | 23 | @task(name="Index book github repo") 24 | def index_book_github_repo(): 25 | owner = 'alexeygrigorev' 26 | repo = 'mlbookcamp-code' 27 | branch = 'master' 28 | ignore_directories = ['.github', 'course-zoomcamp', 'images', 'util'] 29 | index_github_repo(owner=owner, 30 | repo=repo, 31 | branch=branch, 32 | ignore_directories=ignore_directories, 33 | collection_name=FAQ_COLLECTION_NAME) 34 | 35 | 36 | @task(name="Index FAQ Google Document") 37 | def index_google_doc(): 38 | document_ids = ["1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8"] 39 | print('Loading google doc...') 40 | index_faq(document_ids, FAQ_COLLECTION_NAME) 41 | 42 | 43 | @task(name="Index course schedule") 44 | def index_course_schedule(): 45 | url = ('https://docs.google.com/spreadsheets/d/e/2PACX' 46 | '-1vSkEwMv5OKwCdPfW6LgqQvKk48dZjPcFDrjDstBqZfq38UPadh0Nws1b57qOVYwzAjSufKnVf7umGWH/pubhtml') 47 | title = 'ML Zoomcamp 2023 syllabus and deadlines' 48 | index_spreadsheet(url, title, FAQ_COLLECTION_NAME) 49 | 50 | 51 | @task(name="Index project evaluation criteria") 52 | def index_evaluation_criteria(): 53 | url = ('https://docs.google.com/spreadsheets/d/e/2PACX' 54 | '-1vQCwqAtkjl07MTW-SxWUK9GUvMQ3Pv_fF8UadcuIYLgHa0PlNu9BRWtfLgivI8xSCncQs82HDwGXSm3/pubhtml') 55 | title = 'ML Zoomcamp project evaluation criteria : Project criteria' 56 | index_spreadsheet(url, title, FAQ_COLLECTION_NAME) 57 | 58 | 59 | @task(name="Index slack messages") 60 | def index_slack_messages(): 61 | channel_ids = [ML_CHANNEL_ID] 62 | index_slack_history(channel_ids, FAQ_COLLECTION_NAME) 63 | 64 | 65 | @flow(name="Update ML info Milvus index", log_prints=True) 66 | def fill_ml_index(): 67 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}") 68 | index_google_doc() 69 | index_slack_messages.submit(wait_for=[index_google_doc]) 70 | index_course_schedule.submit(wait_for=[index_google_doc]) 71 | index_evaluation_criteria.submit(wait_for=[index_google_doc]) 72 | index_course_github_repo.submit(wait_for=[index_google_doc]) 73 | index_book_github_repo.submit(wait_for=[index_google_doc]) 74 | 75 | 76 | if __name__ == '__main__': 77 | fill_ml_index() 78 | -------------------------------------------------------------------------------- /ingest/mlops.dockerfile: -------------------------------------------------------------------------------- 1 | FROM prefecthq/prefect:2-python3.10 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | RUN pip install -U pip 7 | 8 | WORKDIR /usr/src 9 | 10 | COPY ingest/requirements.txt ./ 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | ENV EMBEDDING_CACHE_NAMESPACE=mlops_zoomcamp 14 | 15 | COPY ingest/mlops/ingest_mlops.py ingest/mlops/ 16 | COPY ingest/readers ingest/readers 17 | COPY ingest/utils ingest/utils 18 | -------------------------------------------------------------------------------- /ingest/mlops/ingest_mlops.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from prefect import flow, task 4 | 5 | from ingest.utils.index_utils import index_github_repo, \ 6 | index_slack_history, index_faq 7 | 8 | SLACK_CHANNEL_ID = 'C02R98X7DS9' 9 | COLLECTION_NAME = 'mlopszoomcamp' 10 | 11 | 12 | @task(name="Index course github repo") 13 | def index_course_github_repo(): 14 | owner = 'DataTalksClub' 15 | repo = 'mlops-zoomcamp' 16 | 
branch = 'main' 17 | index_github_repo(owner=owner, 18 | repo=repo, 19 | branch=branch, 20 | collection_name=COLLECTION_NAME, 21 | ignore_directories=['.github', '.gitignore', 'cohorts/2022', 'cohorts/2023', 'images'], 22 | ) 23 | 24 | 25 | @task(name="Index FAQ Google Document") 26 | def index_google_doc(): 27 | document_ids = ["12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0"] 28 | print('Loading google doc...') 29 | index_faq(document_ids, COLLECTION_NAME) 30 | 31 | 32 | @task(name="Index slack messages") 33 | def index_slack_messages(): 34 | channel_ids = [SLACK_CHANNEL_ID] 35 | index_slack_history(channel_ids, COLLECTION_NAME) 36 | 37 | 38 | @flow(name="Update MLOps info Milvus index", log_prints=True) 39 | def fill_mlops_index(): 40 | print(f"Execution environment is {os.getenv('EXECUTION_ENV', 'local')}") 41 | index_google_doc() 42 | index_slack_messages.submit(wait_for=[index_google_doc]) 43 | index_course_github_repo.submit(wait_for=[index_google_doc]) 44 | 45 | 46 | if __name__ == '__main__': 47 | fill_mlops_index() 48 | -------------------------------------------------------------------------------- /ingest/mlops/ingest_mlops_old.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import tempfile 5 | import time 6 | from pathlib import Path 7 | 8 | import pinecone # type: ignore 9 | from langchain_community.document_loaders import GoogleDriveLoader, GitLoader 10 | from langchain.embeddings import HuggingFaceEmbeddings 11 | from langchain.text_splitter import RecursiveCharacterTextSplitter 12 | from langchain.vectorstores import Pinecone 13 | from prefect import flow, task 14 | from prefect.blocks.system import Secret 15 | from prefect_gcp import GcpCredentials 16 | 17 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 18 | embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5') 19 | embedding_dimension = len(embeddings.embed_query("test")) 20 | print(f'embedding dimension = {embedding_dimension}') 21 | 22 | 23 | @task(name="Index FAQ Google Document") 24 | def ingest_google_doc(index_name: str, 25 | document_ids: list[str], 26 | ): 27 | print('Loading google doc...') 28 | temp_creds = tempfile.NamedTemporaryFile() 29 | creds_dict = GcpCredentials.load("google-drive-creds").service_account_info.get_secret_value() 30 | with open(temp_creds.name, 'w') as f_out: 31 | json.dump(creds_dict, f_out) 32 | loader = GoogleDriveLoader(service_account_key=Path(temp_creds.name), 33 | document_ids=document_ids) 34 | # loader = GoogleDriveLoader(service_account_key=Path.cwd() / "keys" / "service_account_key.json", 35 | # document_ids=document_ids) 36 | 37 | raw_docs = loader.load() 38 | temp_creds.close() 39 | print('Splitting docs for indexing...') 40 | text_splitter = get_text_splitter() 41 | docs = text_splitter.split_documents(raw_docs) 42 | 43 | index_docs(docs, index_name) 44 | 45 | 46 | def index_docs(docs, index_name): 47 | print('Filling the index up...') 48 | Pinecone.from_documents(docs, embeddings, index_name=index_name) 49 | time.sleep(10) 50 | print_index_status(index_name) 51 | 52 | 53 | @task(name="Delete and Create Pinecone index") 54 | def create_pinecone_index(index_name: str): 55 | if index_name in pinecone.list_indexes(): 56 | print(f"Index {index_name} exists. 
Deleting...") 57 | pinecone.delete_index(index_name) 58 | 59 | if index_name not in pinecone.list_indexes(): 60 | print(f"Creating index {index_name}...") 61 | pinecone.create_index( 62 | name=index_name, 63 | dimension=embedding_dimension 64 | ) 65 | 66 | print_index_status(index_name) 67 | 68 | 69 | def print_index_status(index_name): 70 | index = pinecone.GRPCIndex(index_name) 71 | index_stats = index.describe_index_stats() 72 | print(f"index stats: {index_stats}") 73 | 74 | 75 | @task(name="Index git repo") 76 | def ingest_git_repo(repo_url: str, index_name: str): 77 | local_dir_path = f"./git/{repo_url[repo_url.rindex('/') + 1:]}" 78 | if Path(local_dir_path).exists(): 79 | remove_local_dir(local_dir_path) 80 | loader = GitLoader( 81 | clone_url=repo_url, 82 | repo_path=local_dir_path, 83 | ) 84 | print('Loading and Splitting git repo for indexing...') 85 | text_splitter = get_text_splitter() 86 | docs = loader.load_and_split(text_splitter) 87 | index_docs(docs, index_name) 88 | remove_local_dir(local_dir_path) 89 | 90 | 91 | def remove_local_dir(local_dir_path): 92 | print(f'Removing local files in {local_dir_path}') 93 | shutil.rmtree(local_dir_path) 94 | 95 | 96 | def get_text_splitter(): 97 | return RecursiveCharacterTextSplitter( 98 | chunk_size=1000, 99 | chunk_overlap=200, 100 | ) 101 | 102 | 103 | @flow(name="Update the index in Pinecone for MLOps Zoomcamp", log_prints=True) 104 | def create_and_fill_the_index(index_name: str, 105 | google_doc_ids: list[str], 106 | repo_url: str, 107 | overwrite: bool): 108 | pinecone.init( 109 | api_key=Secret.load('pinecone-api-key').get(), 110 | environment=Secret.load('pinecone-env').get() 111 | ) 112 | if overwrite: 113 | create_pinecone_index(index_name=index_name) 114 | ingest_google_doc(index_name, 115 | google_doc_ids) 116 | ingest_git_repo(repo_url, index_name) 117 | 118 | 119 | if __name__ == "__main__": 120 | index_name = 'mlops-faq-bot' 121 | google_doc_id = ["12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0"] 122 | repo_url = 'https://github.com/DataTalksClub/mlops-zoomcamp' 123 | overwrite = True 124 | create_and_fill_the_index(index_name=index_name, 125 | google_doc_ids=google_doc_id, 126 | repo_url=repo_url, 127 | overwrite=overwrite) 128 | -------------------------------------------------------------------------------- /ingest/prefect.md: -------------------------------------------------------------------------------- 1 | # Run the ingestion for ML with prefect deployments 2 | 3 | ## Execute ingestion 4 | 5 | Currently, indexing is scheduled to execute: 6 | - Daily at 00:00 CET for **DE Zoomcamp** documents 7 | - Weekly at 23:00 CET on Monday for **ML Zoomcamp** documents 8 | 9 | Before running any execution make sure the worker is started: 10 | ```shell 11 | prefect worker start --pool zoomcamp-faq-bot 12 | ``` 13 | 14 | Ad-hoc executions can be run from the [Prefect Cloud UI](https://app.prefect.cloud/) 15 | by launching the corresponding deployment. 
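If you are unsure of the exact deployment name, the deployments registered in the current workspace can be listed from the CLI first (a quick check, assuming the Prefect CLI is logged in to the same Prefect Cloud workspace):

```shell
prefect deployment ls
```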
16 | 17 | 18 | It's also possible to run it from the command line: 19 | 20 | ### Run ingestion deployment for ML 21 | ```shell 22 | prefect deployment run 'Update ML info Milvus index/fill-index-zilliz-ml' 23 | ``` 24 | 25 | ### Run ingestion deployment for DE 26 | ```shell 27 | prefect deployment run 'Update DE info Milvus index/fill-index-zilliz-de' 28 | ``` 29 | 30 | ## Change the properties of a deployment 31 | ### Bulk 32 | Depending on the nature of the changes, after modifying the code or 33 | [prefect.yaml](../prefect.yaml) re-create both deployments by running 34 | 35 | ```shell 36 | prefect deploy --all 37 | ``` 38 | ### Individual 39 | Alternatively it can be done per deployment if the changes are not affecting both 40 | **re-create deployment for ML ingestion** 41 | ```shell 42 | prefect deploy --name fill-index-zilliz-ml 43 | ``` 44 | **re-create deployment for DE ingestion** 45 | ```shell 46 | prefect deploy --name fill-index-zilliz-de 47 | ``` 48 | 49 | ## Setup prefect from scratch 50 | 51 | Login to prefect cloud: 52 | 53 | ```shell 54 | prefect cloud login 55 | ``` 56 | 57 | Create the required blocks: 58 | 59 | ```shell 60 | python ingest/prefect_infra/create_secrets_blocks.py 61 | ``` 62 | 63 | Create work pool 64 | 65 | ```shell 66 | prefect work-pool create --type docker zoomcamp-faq-bot 67 | ``` 68 | 69 | Run the following command in this new terminal to start the worker: 70 | 71 | ```shell 72 | prefect worker start --pool zoomcamp-faq-bot 73 | ``` 74 | 75 | Create all the deployments from [prefect.yaml](../prefect.yaml) file 76 | 77 | ```shell 78 | prefect deploy --all 79 | ``` 80 | 81 | Run the ingestion by executing created deployments following the 82 | instructions above. -------------------------------------------------------------------------------- /ingest/prefect_infra/create_secrets_blocks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | 5 | from prefect.blocks.system import Secret 6 | from prefect_gcp import GcpCredentials 7 | 8 | 9 | def create_gcp_creds_block(): 10 | block_name = "google-drive-creds" 11 | try: 12 | GcpCredentials.load(block_name) 13 | print(f"Block {block_name} exists") 14 | except ValueError: 15 | print(f"Creating Block {block_name}") 16 | with open("../keys/service_account_key.json", 'r') as f_in: 17 | service_account_info_str = f_in.read() 18 | 19 | service_account_info = json.loads(service_account_info_str) 20 | 21 | GcpCredentials( 22 | service_account_info=service_account_info 23 | ).save(block_name) 24 | time.sleep(10) 25 | 26 | 27 | def create_secret_block(block_name: str, env_var_name: str) -> None: 28 | try: 29 | Secret.load(block_name) 30 | print(f"Block {block_name} exists") 31 | except ValueError: 32 | print(f"Creating Block {block_name}") 33 | Secret(value=os.getenv(env_var_name)).save(name=block_name) 34 | time.sleep(10) 35 | 36 | 37 | def create_pinecone_secrets(): 38 | create_secret_block('pinecone-api-key', 'PINECONE_API_KEY') 39 | create_secret_block('pinecone-env', 'PINECONE_ENV') 40 | 41 | 42 | def create_zilliz_secrets(): 43 | create_secret_block('zilliz-cloud-uri', 'ZILLIZ_CLOUD_URI') 44 | create_secret_block('zilliz-cloud-api-key', 'ZILLIZ_CLOUD_API_KEY') 45 | create_secret_block('zilliz-public-endpoint', 'ZILLIZ_PUBLIC_ENDPOINT') 46 | create_secret_block('zilliz-api-key', 'ZILLIZ_API_KEY') 47 | 48 | 49 | def create_slack_secrets(): 50 | create_secret_block('slack-bot-token', 'SLACK_BOT_TOKEN') 51 | 52 | 53 | def 
create_github_secrets(): 54 | create_secret_block('github-token', 'GITHUB_TOKEN') 55 | 56 | 57 | def create_upstash_redis_secrets(): 58 | create_secret_block('upstash-redis-rest-url', 'UPSTASH_REDIS_REST_URL') 59 | create_secret_block('upstash-redis-rest-token', 'UPSTASH_REDIS_REST_TOKEN') 60 | 61 | 62 | if __name__ == '__main__': 63 | create_gcp_creds_block() 64 | create_pinecone_secrets() 65 | create_zilliz_secrets() 66 | create_slack_secrets() 67 | create_github_secrets() 68 | create_upstash_redis_secrets() 69 | -------------------------------------------------------------------------------- /ingest/readers/custom_faq_gdoc_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Optional 3 | 4 | from llama_index.core.readers.base import BasePydanticReader 5 | from llama_index.core.schema import Document 6 | 7 | DEFAULT_TOKEN_JSON_PATH = 'token.json' 8 | DEFAULT_SERVICE_ACCOUNT_JSON_PATH = 'service_account.json' 9 | DEFAULT_CREDENTIALS_JSON_PATH = 'credentials.json' 10 | 11 | HEADING_STYLE_TEMPLATE = 'HEADING_{}' 12 | DEFAULT_QUESTION_HEADING_STYLE_NUM = 2 13 | 14 | EXCLUDED_LLM_METADATA_KEYS = ['source', 'title', 'section_name'] 15 | EXCLUDED_EMBED_METADATA_KEYS = ['source', 'title'] 16 | 17 | SCOPES = ["https://www.googleapis.com/auth/documents.readonly"] 18 | 19 | 20 | class FAQGoogleDocsReader(BasePydanticReader): 21 | token_json_path: str = DEFAULT_TOKEN_JSON_PATH 22 | service_account_json_path: str = DEFAULT_SERVICE_ACCOUNT_JSON_PATH 23 | credentials_json_path: str = DEFAULT_CREDENTIALS_JSON_PATH 24 | question_heading_style_num: int = DEFAULT_QUESTION_HEADING_STYLE_NUM 25 | is_remote: bool = True 26 | 27 | def __init__(self, 28 | token_json_path: Optional[str] = DEFAULT_TOKEN_JSON_PATH, 29 | service_account_json_path: Optional[str] = DEFAULT_SERVICE_ACCOUNT_JSON_PATH, 30 | credentials_json_path: Optional[str] = DEFAULT_CREDENTIALS_JSON_PATH, 31 | question_heading_style_num: Optional[int] = DEFAULT_QUESTION_HEADING_STYLE_NUM 32 | ) -> None: 33 | """Initialize with parameters.""" 34 | try: 35 | import google # noqa 36 | import google_auth_oauthlib # noqa 37 | import googleapiclient # noqa 38 | except ImportError as e: 39 | raise ImportError( 40 | '`google_auth_oauthlib`, `googleapiclient` and `google` ' 41 | 'must be installed to use the GoogleDocsReader.\n' 42 | 'Please run `pip install --upgrade google-api-python-client ' 43 | 'google-auth-httplib2 google-auth-oauthlib`.' 44 | ) from e 45 | super().__init__(token_json_path=token_json_path, 46 | service_account_json_path=service_account_json_path, 47 | credentials_json_path=credentials_json_path, 48 | question_heading_style_num=question_heading_style_num) 49 | 50 | @classmethod 51 | def class_name(cls) -> str: 52 | return 'CustomGoogleDocsReader' 53 | 54 | def load_data(self, document_ids: [str]) -> [Document]: 55 | """Load data from the input directory. 56 | 57 | Args: 58 | document_ids (List[str]): a list of document ids. 59 | """ 60 | if document_ids is None: 61 | raise ValueError('Must specify a "document_ids" in `load_kwargs`.') 62 | 63 | results = [] 64 | for document_id in document_ids: 65 | docs = self._load_docs(document_id) 66 | results.extend(docs) 67 | return results 68 | 69 | def _load_docs(self, document_id: str) -> [Document]: 70 | """Load a document from Google Docs. 71 | 72 | Args: 73 | document_id: the document id. 74 | 75 | Returns: 76 | The document text. 
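            In practice this is a list of Document objects, one per FAQ
            question, as produced by _structural_elements_to_docs.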
77 | """ 78 | import googleapiclient.discovery as discovery 79 | 80 | credentials = self._get_credentials() 81 | docs_service = discovery.build('docs', 'v1', credentials=credentials) 82 | doc = docs_service.documents().get(documentId=document_id).execute() 83 | doc_content = doc.get('body').get('content') 84 | doc_source = f'https://docs.google.com/document/d/{document_id}/edit#heading=' 85 | return self._structural_elements_to_docs(doc_content, doc_source) 86 | 87 | def _get_credentials(self) -> Any: 88 | """Get valid user credentials from storage. 89 | 90 | The file token.json stores the user's access and refresh tokens, and is 91 | created automatically when the authorization flow completes for the first 92 | time. 93 | 94 | Returns: 95 | Credentials, the obtained credential. 96 | """ 97 | from google.auth.transport.requests import Request 98 | from google.oauth2 import service_account 99 | from google.oauth2.credentials import Credentials 100 | from google_auth_oauthlib.flow import InstalledAppFlow 101 | 102 | creds = None 103 | if os.path.exists(self.token_json_path): 104 | creds = Credentials.from_authorized_user_file(self.token_json_path, SCOPES) 105 | elif os.path.exists(self.service_account_json_path): 106 | return service_account.Credentials.from_service_account_file( 107 | self.service_account_json_path, scopes=SCOPES 108 | ) 109 | # If there are no (valid) credentials available, let the user log in. 110 | if not creds or not creds.valid: 111 | if creds and creds.expired and creds.refresh_token: 112 | creds.refresh(Request()) 113 | else: 114 | flow = InstalledAppFlow.from_client_secrets_file( 115 | self.credentials_json_path, SCOPES 116 | ) 117 | creds = flow.run_local_server(port=8080) 118 | # Save the credentials for the next run 119 | with open(self.token_json_path, 'w') as token: 120 | token.write(creds.to_json()) 121 | 122 | return creds 123 | 124 | @staticmethod 125 | def _read_paragraph_element(element: Any) -> Any: 126 | """Return the text in the given ParagraphElement. 127 | 128 | Args: 129 | element: a ParagraphElement from a Google Doc. 130 | """ 131 | text_run = element.get('textRun') 132 | return text_run.get('content') if text_run else '' 133 | 134 | @staticmethod 135 | def _get_text_from_paragraph_elements(elements: [Any]) -> Any: 136 | return ''.join(FAQGoogleDocsReader._read_paragraph_element(elem) for elem in elements) 137 | 138 | def _structural_elements_to_docs(self, 139 | doc_elements: [Any], 140 | doc_source: str) -> [Document]: 141 | """Recurse through a list of Structural Elements. 142 | 143 | Read a document's text where text may be in nested elements. 144 | 145 | Args: 146 | doc_elements: a list of Structural Elements. 
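            doc_source: base URL of the source Google Doc; the heading id of
                each question is appended to it to build the per-document
                'source' metadata link.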
147 | """ 148 | docs = [] 149 | text = '' 150 | heading_id = '' 151 | section_name = '' 152 | question_heading_style = HEADING_STYLE_TEMPLATE.format(self.question_heading_style_num) 153 | section_heading_style = HEADING_STYLE_TEMPLATE.format(self.question_heading_style_num - 1) 154 | for value in doc_elements: 155 | if 'paragraph' in value: 156 | paragraph = value['paragraph'] 157 | elements = paragraph.get('elements') 158 | paragraph_text = FAQGoogleDocsReader._get_text_from_paragraph_elements(elements) 159 | if 'paragraphStyle' in paragraph and 'headingId' in paragraph['paragraphStyle']: 160 | named_style_type = paragraph['paragraphStyle']['namedStyleType'] 161 | if named_style_type in [ 162 | question_heading_style, 163 | section_heading_style, 164 | ]: 165 | # create previous document checking if it's not empty 166 | if text != '': 167 | node_metadata = { 168 | 'source': doc_source + heading_id, 169 | 'section_name': section_name, 170 | 'title': 'FAQ' 171 | } 172 | prev_doc = Document(text=text, 173 | metadata=node_metadata, 174 | excluded_embed_metadata_keys=EXCLUDED_EMBED_METADATA_KEYS, 175 | excluded_llm_metadata_keys=EXCLUDED_LLM_METADATA_KEYS) 176 | docs.append(prev_doc) 177 | if named_style_type == question_heading_style: 178 | heading_id = paragraph['paragraphStyle']['headingId'] 179 | text = paragraph_text 180 | else: 181 | section_name = paragraph_text 182 | text = '' 183 | else: 184 | text += paragraph_text 185 | return docs 186 | 187 | 188 | if __name__ == '__main__': 189 | reader = FAQGoogleDocsReader(service_account_json_path='../keys/service_account_key.json') 190 | docs = reader.load_data(['1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8']) 191 | print(docs) 192 | -------------------------------------------------------------------------------- /ingest/readers/slack_reader.py: -------------------------------------------------------------------------------- 1 | """Slack reader.""" 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | from datetime import datetime, timedelta 7 | from http.client import IncompleteRead 8 | from ssl import SSLContext 9 | from typing import Any, Optional 10 | 11 | from llama_index.core.bridge.pydantic import PrivateAttr 12 | from llama_index.core.readers.base import BasePydanticReader 13 | from llama_index.core.schema import Document 14 | 15 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, 16 | format='%(message)s') 17 | logger = logging.getLogger(__name__) 18 | EXCLUDED_METADATA_FIELDS = ['channel', 'thread_ts'] 19 | 20 | 21 | class SlackReader(BasePydanticReader): 22 | """Slack reader. 23 | 24 | Reads conversations from channels. If the earliest_date is provided, an 25 | optional latest_date can also be provided. If no latest_date is provided, 26 | we assume the latest date is the current timestamp. 27 | 28 | Args: 29 | slack_token (Optional[str]): Slack token. If not provided, we 30 | assume the environment variable `SLACK_BOT_TOKEN` is set. 31 | ssl (Optional[str]): Custom SSL context. If not provided, it is assumed 32 | there is already an SSL context available. 33 | earliest_date (Optional[datetime]): Earliest date from which 34 | to read conversations. If not provided, we read all messages. 35 | latest_date (Optional[datetime]): Latest date from which to 36 | read conversations. If not provided, defaults to current timestamp 37 | in combination with earliest_date. 
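        bot_user_id (Optional[str]): Slack user id of this bot. Its own
            replies are excluded from the indexed text, and threads answered
            only by the bot are skipped.
        not_ignore_users (Optional[list[str]]): user ids (such as automation
            bots) whose messages are indexed even if they have no replies; for
            these users the attachment text is read instead of the message text.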
38 | """ 39 | 40 | is_remote: bool = True 41 | slack_token: str 42 | earliest_date_timestamp: Optional[float] 43 | latest_date_timestamp: float 44 | bot_user_id: Optional[str] 45 | not_ignore_users: Optional[list[str]] = [] 46 | 47 | _client: Any = PrivateAttr() 48 | 49 | def __init__( 50 | self, 51 | slack_token: Optional[str] = None, 52 | ssl: Optional[SSLContext] = None, 53 | earliest_date: Optional[datetime] = None, 54 | latest_date: Optional[datetime] = None, 55 | earliest_date_timestamp: Optional[float] = None, 56 | latest_date_timestamp: Optional[float] = None, 57 | bot_user_id: Optional[str] = None, 58 | not_ignore_users: Optional[list[str]] = None 59 | ) -> None: 60 | """Initialize with parameters.""" 61 | from slack_sdk import WebClient 62 | 63 | if slack_token is None: 64 | slack_token = os.environ["SLACK_BOT_TOKEN"] 65 | if slack_token is None: 66 | raise ValueError( 67 | "Must specify `slack_token` or set environment " 68 | "variable `SLACK_BOT_TOKEN`." 69 | ) 70 | if ssl is None: 71 | self._client = WebClient(token=slack_token) 72 | else: 73 | self._client = WebClient(token=slack_token, ssl=ssl) 74 | if latest_date is not None and earliest_date is None: 75 | raise ValueError( 76 | "Must specify `earliest_date` if `latest_date` is specified." 77 | ) 78 | if not_ignore_users is None: 79 | not_ignore_users = [] 80 | if earliest_date is not None: 81 | earliest_date_timestamp = earliest_date.timestamp() 82 | else: 83 | earliest_date_timestamp = None or earliest_date_timestamp 84 | if latest_date is not None: 85 | latest_date_timestamp = latest_date.timestamp() 86 | else: 87 | latest_date_timestamp = datetime.now().timestamp() or latest_date_timestamp 88 | res = self._client.api_test() 89 | if not res["ok"]: 90 | raise ValueError(f"Error initializing Slack API: {res['error']}") 91 | 92 | super().__init__( 93 | slack_token=slack_token, 94 | earliest_date_timestamp=earliest_date_timestamp, 95 | latest_date_timestamp=latest_date_timestamp, 96 | bot_user_id=bot_user_id, 97 | not_ignore_users=not_ignore_users, 98 | ) 99 | 100 | @classmethod 101 | def class_name(cls) -> str: 102 | """Get the name identifier of the class.""" 103 | return "SlackReader" 104 | 105 | def _read_message(self, channel_id: str, message_ts: str) -> Document: 106 | from slack_sdk.errors import SlackApiError 107 | 108 | """Read a message.""" 109 | 110 | messages_text: list[str] = [] 111 | next_cursor = None 112 | while True: 113 | try: 114 | # https://slack.com/api/conversations.replies 115 | # List all replies to a message, including the message itself. 
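                # Replies are fetched page by page: the loop keeps requesting with the
                # returned next_cursor until has_more is False, and backs off via
                # sleep_on_ratelimit() when Slack returns a rate-limit error.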
116 | conversations_replies_kwargs = { 117 | "channel": channel_id, 118 | "ts": message_ts, 119 | "cursor": next_cursor, 120 | } 121 | if self.earliest_date_timestamp is not None: 122 | conversations_replies_kwargs |= { 123 | "latest": str(self.latest_date_timestamp), 124 | "oldest": str(self.earliest_date_timestamp), 125 | } 126 | result = self._client.conversations_replies( 127 | **conversations_replies_kwargs # type: ignore 128 | ) 129 | messages = result["messages"] 130 | messages_text.extend(message["text"] for message in messages if message['user'] != self.bot_user_id 131 | and message['user'] not in self.not_ignore_users) 132 | messages_text.extend(message["attachments"][0]["text"] for message in messages if 133 | message['user'] in self.not_ignore_users 134 | and "attachments" in message 135 | and "text" in message["attachments"][0]) 136 | 137 | if not result["has_more"]: 138 | break 139 | 140 | next_cursor = result["response_metadata"]["next_cursor"] 141 | except SlackApiError as e: 142 | self.sleep_on_ratelimit(e) 143 | 144 | return Document(text="\n\n".join(messages_text), 145 | metadata={"channel": channel_id, "thread_ts": float(message_ts)}, 146 | excluded_embed_metadata_keys=EXCLUDED_METADATA_FIELDS, 147 | excluded_llm_metadata_keys=EXCLUDED_METADATA_FIELDS 148 | ) 149 | 150 | def _read_channel(self, channel_id: str) -> list[Document]: 151 | from slack_sdk.errors import SlackApiError 152 | 153 | """Read a channel.""" 154 | 155 | thread_documents: list[Document] = [] 156 | next_cursor = None 157 | while True: 158 | try: 159 | # Call the conversations.history method using the WebClient 160 | # conversations.history returns the first 100 messages by default 161 | # These results are paginated, 162 | # see: https://api.slack.com/methods/conversations.history$pagination 163 | conversations_history_kwargs = { 164 | "channel": channel_id, 165 | "cursor": next_cursor, 166 | "latest": str(self.latest_date_timestamp), 167 | } 168 | if self.earliest_date_timestamp is not None: 169 | conversations_history_kwargs["oldest"] = str( 170 | self.earliest_date_timestamp 171 | ) 172 | result = self._client.conversations_history( 173 | **conversations_history_kwargs # type: ignore 174 | ) 175 | conversation_history = result["messages"] 176 | # Print results 177 | logger.info(f"{len(conversation_history)} messages found in {channel_id}") 178 | 179 | for message in conversation_history: 180 | if self.is_for_indexing(message): 181 | read_message: Document = self._read_message(channel_id, message["ts"]) 182 | if read_message.text != "": 183 | thread_documents.append(read_message) 184 | 185 | if not result["has_more"]: 186 | break 187 | next_cursor = result["response_metadata"]["next_cursor"] 188 | 189 | except SlackApiError as e: 190 | self.sleep_on_ratelimit(e) 191 | except IncompleteRead: 192 | continue 193 | 194 | return thread_documents 195 | 196 | @staticmethod 197 | def sleep_on_ratelimit(e): 198 | if e.response["error"] == "ratelimited": 199 | retry_after = e.response.headers["retry-after"] 200 | logger.error( 201 | f'Rate limit error reached, sleeping for: {retry_after} seconds' 202 | ) 203 | time.sleep(int(retry_after) + 1) 204 | else: 205 | logger.error(f"Error parsing conversation replies: {e}") 206 | 207 | def is_for_indexing(self, message): 208 | # ignore unanswered messages 209 | if 'reply_count' in message: 210 | # if bot user id isn't specified or bot hasn't replied the message 211 | if not self.bot_user_id or self.bot_user_id not in message['reply_users']: 212 | return True 213 | if 
message['reply_users_count'] > 1: 214 | return True 215 | # even if it's a single message but from a user in un-ignore list, index it 216 | elif message['user'] in self.not_ignore_users: 217 | return True 218 | return False 219 | 220 | def load_data(self, channel_ids: list[str]) -> list[Document]: 221 | """Load data from the input directory. 222 | 223 | Args: 224 | channel_ids (List[str]): List of channel ids to read. 225 | Returns: 226 | List[Document]: List of documents. 227 | """ 228 | results = [] 229 | for channel_id in channel_ids: 230 | results.extend(self._read_channel(channel_id)) 231 | return results 232 | 233 | 234 | if __name__ == "__main__": 235 | reader = SlackReader(earliest_date=datetime.now() - timedelta(days=2), 236 | bot_user_id='U05DM3PEJA2', 237 | not_ignore_users=['U01S08W6Z9T']) 238 | for thread in reader.load_data(channel_ids=["C02R98X7DS9"]): 239 | logger.info(f'Text: {thread.text}') 240 | logger.info(f'Metadata: {thread.metadata}') 241 | logger.info('----------------------------') 242 | -------------------------------------------------------------------------------- /ingest/readers/youtube_reader.py: -------------------------------------------------------------------------------- 1 | """YouTube reader.""" 2 | 3 | from llama_index.core.readers.base import BasePydanticReader 4 | from llama_index.core.schema import Document 5 | 6 | 7 | class YoutubeReader(BasePydanticReader): 8 | 9 | def __init__(self) -> None: 10 | try: 11 | from youtube_transcript_api import YouTubeTranscriptApi 12 | except ImportError as e: 13 | raise ImportError( 14 | '`youtube_transcript_api` must be installed to use the YoutubeReader.\n' 15 | 'Please run `pip install --upgrade youtube-transcript-api`.' 16 | ) from e 17 | 18 | super().__init__() 19 | 20 | @classmethod 21 | def class_name(cls) -> str: 22 | """Get the name identifier of the class.""" 23 | return "YoutubeReader" 24 | 25 | def load_data(self, video_ids: list[str], tokenizer) -> list[Document]: 26 | from youtube_transcript_api import YouTubeTranscriptApi 27 | 28 | documents: list[Document] = [] 29 | for video_id in video_ids: 30 | yt_title = YoutubeReader._read_title(video_id) 31 | current_start = None 32 | current_text = "" 33 | current_token_count = 0 34 | transcript_array = YouTubeTranscriptApi.get_transcript(video_id) 35 | 36 | for segment in transcript_array: 37 | # Get the token count of the current segment text 38 | token_count = len(tokenizer(segment["text"], truncation=False, add_special_tokens=False)['input_ids']) 39 | 40 | # If adding this segment exceeds 512 tokens, finalize the current document 41 | if current_token_count + token_count > 512: 42 | documents.append(Document( 43 | text=current_text.strip(), 44 | metadata=YoutubeReader._get_node_metadata(video_id, int(current_start), yt_title), 45 | excluded_embed_metadata_keys=['yt_link'], 46 | excluded_llm_metadata_keys=['yt_link'] 47 | )) 48 | 49 | # Start a new chunk 50 | current_start = segment["start"] 51 | current_text = segment["text"] 52 | current_token_count = token_count 53 | else: 54 | # Concatenate to the current chunk 55 | if not current_text: 56 | current_start = segment["start"] 57 | current_text += " " + segment["text"] 58 | current_token_count += token_count 59 | 60 | # Append the last chunk if it exists 61 | if current_text: 62 | documents.append(Document( 63 | text=current_text.strip(), 64 | metadata=YoutubeReader._get_node_metadata(video_id, int(current_start), yt_title), 65 | excluded_embed_metadata_keys=['yt_link'], 66 | 
excluded_llm_metadata_keys=['yt_link'] 67 | )) 68 | 69 | return documents 70 | 71 | @staticmethod 72 | def _get_node_metadata(video_id: str, pos: int, yt_title: str) -> dict: 73 | return { 74 | 'yt_link': f"https://www.youtube.com/watch?v={video_id}&t={pos}s", 75 | 'yt_title': yt_title 76 | } 77 | 78 | @staticmethod 79 | def _read_title(video_id: str) -> str: 80 | params = { 81 | "format": "json", 82 | "url": f"https://www.youtube.com/watch?v={video_id}" 83 | } 84 | url = "https://www.youtube.com/oembed" 85 | 86 | import requests 87 | response = requests.get(url, params=params) 88 | if response.status_code == 200: 89 | data = response.json() 90 | return data['title'] 91 | else: 92 | print(f"Failed to retrieve data: {response.status_code}") 93 | return '' 94 | -------------------------------------------------------------------------------- /ingest/requirements.txt: -------------------------------------------------------------------------------- 1 | slack-sdk==3.30.0 2 | langchain==0.1.20 3 | google-api-python-client==2.134.0 4 | google-auth-httplib2==0.2.0 5 | google-auth-oauthlib==1.2.0 6 | sentence-transformers==3.0.1 7 | prefect-gcp==0.5.12 8 | GitPython==3.1.43 9 | pymilvus==2.4.4 10 | llama-index-core==0.10.48 11 | llama-index-readers-web==0.1.19 12 | llama-index-readers-github==0.1.9 13 | llama-index-vector-stores-milvus==0.1.20 14 | llama-index-embeddings-langchain==0.1.2 15 | trafilatura==1.10.0 16 | nbconvert==7.16.4 17 | ipython==8.25.0 18 | upstash-redis==1.1.0 19 | jupyter-notebook-parser==0.1.4 20 | youtube-transcript-api==0.6.3 21 | -------------------------------------------------------------------------------- /ingest/utils/index_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | from datetime import datetime, timedelta 5 | 6 | from jupyter_notebook_parser import JupyterNotebookParser 7 | from langchain.embeddings import CacheBackedEmbeddings 8 | from langchain_community.embeddings import HuggingFaceEmbeddings 9 | from langchain_community.storage import UpstashRedisByteStore 10 | from llama_index.core import Settings 11 | from llama_index.core.indices import VectorStoreIndex 12 | from llama_index.core.node_parser import NodeParser, SentenceSplitter, MarkdownNodeParser 13 | from llama_index.core.schema import Document 14 | from llama_index.core.storage import StorageContext 15 | from llama_index.readers.github import GithubRepositoryReader, GithubClient 16 | from llama_index.readers.web import TrafilaturaWebReader 17 | from llama_index.vector_stores.milvus import MilvusVectorStore 18 | from prefect.blocks.system import Secret 19 | from prefect_gcp import GcpCredentials 20 | from upstash_redis import Redis 21 | 22 | from ingest.readers.custom_faq_gdoc_reader import FAQGoogleDocsReader 23 | from ingest.readers.slack_reader import SlackReader 24 | from ingest.readers.youtube_reader import YoutubeReader 25 | 26 | BOT_USER_ID = 'U05DM3PEJA2' 27 | AU_TOMATOR_USER_ID = 'U01S08W6Z9T' 28 | 29 | EXCLUDE_FILTER_TYPE = GithubRepositoryReader.FilterType.EXCLUDE 30 | 31 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 32 | 33 | embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5') 34 | 35 | embedding_dimension = len(embeddings.embed_query("test")) 36 | print(f'embedding dimension = {embedding_dimension}') 37 | 38 | 39 | def load_embeddings() -> CacheBackedEmbeddings: 40 | redis_client = Redis(url=Secret.load('upstash-redis-rest-url').get(), 41 | 
token=Secret.load('upstash-redis-rest-token').get()) 42 | embeddings_cache = UpstashRedisByteStore(client=redis_client, 43 | ttl=None, 44 | namespace=os.getenv('EMBEDDING_CACHE_NAMESPACE')) 45 | 46 | cached_embedder = CacheBackedEmbeddings.from_bytes_store( 47 | embeddings, 48 | embeddings_cache, 49 | namespace=embeddings.model_name + "/", 50 | ) 51 | return cached_embedder 52 | 53 | 54 | Settings.embed_model = load_embeddings() 55 | Settings.llm = None 56 | 57 | 58 | def index_spreadsheet(url: str, title: str, collection_name: str): 59 | documents = TrafilaturaWebReader().load_data([url]) 60 | for doc in documents: 61 | doc.metadata['title'] = title 62 | doc.metadata['source'] = url 63 | add_route_to_docs(documents, 'faq') 64 | add_to_index(documents, collection_name=collection_name) 65 | 66 | 67 | def add_route_to_docs(docs: [Document], route_name: str): 68 | route_key_name = 'route' 69 | for doc in docs: 70 | doc.metadata[route_key_name] = route_name 71 | doc.excluded_embed_metadata_keys.append(route_key_name) 72 | doc.excluded_llm_metadata_keys.append(route_key_name) 73 | 74 | 75 | def add_to_index(documents: list[Document], 76 | collection_name: str, 77 | overwrite: bool = False, 78 | node_parser: NodeParser = None): 79 | sentence_splitter = SentenceSplitter.from_defaults(chunk_size=512, chunk_overlap=50, 80 | tokenizer=embeddings.client.tokenizer) 81 | environment = os.getenv('EXECUTION_ENV', 'local') 82 | if environment == 'local': 83 | milvus_vector_store = MilvusVectorStore(uri='http://localhost:19530', 84 | collection_name=collection_name, 85 | dim=embedding_dimension, 86 | overwrite=overwrite) 87 | elif environment == 'zilliz-cluster': 88 | milvus_vector_store = MilvusVectorStore( 89 | uri=Secret.load('zilliz-public-endpoint').get(), 90 | token=Secret.load('zilliz-api-key').get(), 91 | collection_name=collection_name, 92 | dim=embedding_dimension, 93 | overwrite=overwrite) 94 | else: 95 | milvus_vector_store = MilvusVectorStore(collection_name=collection_name, 96 | uri=Secret.load('zilliz-cloud-uri').get(), 97 | token=Secret.load('zilliz-cloud-api-key').get(), 98 | dim=embedding_dimension, 99 | overwrite=overwrite) 100 | storage_context = StorageContext.from_defaults(vector_store=milvus_vector_store) 101 | transformations = [t for t in [node_parser, sentence_splitter] if t is not None] 102 | 103 | VectorStoreIndex.from_documents(documents, 104 | transformations=transformations, 105 | storage_context=storage_context, 106 | show_progress=True) 107 | 108 | 109 | def index_github_repo(owner: str, 110 | repo: str, 111 | branch: str, 112 | collection_name: str, 113 | ignore_file_extensions: [str] = None, 114 | ignore_directories: [str] = None, 115 | ): 116 | if ignore_file_extensions is None: 117 | ignore_file_extensions = ['.jpg', '.png', '.svg', '.gitignore', '.csv', '.jar'] 118 | if ignore_directories is None: 119 | ignore_directories = ['.github', '.gitignore', '2021', '2022', 'images'] 120 | github_client = GithubClient(Secret.load('github-token').get(), verbose=True) 121 | documents = GithubRepositoryReader( 122 | github_client=github_client, 123 | owner=owner, 124 | repo=repo, 125 | filter_directories=(ignore_directories, EXCLUDE_FILTER_TYPE), 126 | filter_file_extensions=(ignore_file_extensions, EXCLUDE_FILTER_TYPE), 127 | ).load_data(branch=branch) 128 | for doc in documents: 129 | doc.metadata['branch'] = branch 130 | doc.metadata['owner'] = owner 131 | doc.metadata['repo'] = repo 132 | add_route_to_docs(documents, 'github') 133 | 134 | ipynb_docs = [parse_ipynb_doc(doc) for 
doc in documents if doc.metadata.get('file_name', '').endswith('.ipynb')] 135 | md_docs = [doc for doc in documents if doc.metadata.get('file_name', '').endswith('.md')] 136 | other_docs = [doc for doc in documents if not doc.metadata.get('file_name', '').endswith(('.ipynb', '.md'))] 137 | 138 | add_to_index(other_docs, collection_name=collection_name) 139 | add_to_index(md_docs, collection_name=collection_name, node_parser=MarkdownNodeParser()) 140 | add_to_index(ipynb_docs, collection_name=collection_name) 141 | 142 | 143 | def parse_ipynb_doc(ipynb_doc: Document) -> Document: 144 | ipynb_json = json.loads(ipynb_doc.text) 145 | temp_ipynb = tempfile.NamedTemporaryFile(suffix='.ipynb') 146 | try: 147 | with open(temp_ipynb.name, 'w') as f_out: 148 | json.dump(ipynb_json, f_out) 149 | parsed = JupyterNotebookParser(temp_ipynb.name) 150 | all_cells = parsed.get_all_cells() 151 | parsed_text = ''.join([JupyterNotebookParser._join_source_lines(cell.get('source', '')) 152 | for cell in all_cells]) 153 | ipynb_doc.text = parsed_text 154 | return ipynb_doc 155 | finally: 156 | temp_ipynb.close() 157 | 158 | 159 | def index_slack_history(channel_ids: [str], collection_name: str): 160 | earliest_date = datetime.now() - timedelta(days=90) 161 | slack_reader = SlackReader(earliest_date=earliest_date, 162 | bot_user_id=BOT_USER_ID, 163 | not_ignore_users=[AU_TOMATOR_USER_ID], 164 | slack_token=Secret.load('slack-bot-token').get()) 165 | print('Starting to load slack messages from the last 90 days') 166 | documents = slack_reader.load_data(channel_ids=channel_ids) 167 | add_route_to_docs(documents, 'slack') 168 | print('Starting to add loaded Slack messages to the index') 169 | add_to_index(documents, collection_name=collection_name) 170 | 171 | 172 | def index_faq(document_ids: [str], collection_name: str): 173 | temp_creds = tempfile.NamedTemporaryFile() 174 | creds_dict = GcpCredentials.load("google-drive-creds").service_account_info.get_secret_value() 175 | with open(temp_creds.name, 'w') as f_out: 176 | json.dump(creds_dict, f_out) 177 | gdocs_reader = FAQGoogleDocsReader(service_account_json_path=temp_creds.name) 178 | print('Starting to load FAQ document') 179 | documents = gdocs_reader.load_data(document_ids=document_ids) 180 | temp_creds.close() 181 | add_route_to_docs(documents, 'faq') 182 | print('Starting to add loaded FAQ document to the index') 183 | add_to_index(documents, 184 | collection_name=collection_name, 185 | overwrite=True, 186 | ) 187 | 188 | 189 | def index_youtube(video_ids: list[str], collection_name: str): 190 | yt_reader = YoutubeReader() 191 | documents = yt_reader.load_data(video_ids=video_ids, tokenizer=embeddings.client.tokenizer) 192 | print('Starting to add loaded Video transcripts to the index') 193 | add_to_index(documents, collection_name=collection_name) 194 | -------------------------------------------------------------------------------- /prefect.yaml: -------------------------------------------------------------------------------- 1 | # Welcome to your prefect.yaml file! You can use this file for storing and managing 2 | # configuration for deploying your flows. We recommend committing this file to source 3 | # control along with your flow code. 
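# Assuming the Prefect CLI is authenticated against the target workspace, the deployments
# declared below can usually be registered all at once with `prefect deploy --all`,
# or one at a time with `prefect deploy --name <deployment-name>`.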
4 | 5 | # Generic metadata about this project 6 | name: zoomcamp-bot-index 7 | prefect-version: 2.19.5 8 | 9 | # build section allows you to manage and build docker images 10 | build: 11 | 12 | # push section allows you to manage if and how this project is uploaded to remote locations 13 | push: 14 | - prefect_docker.deployments.steps.push_docker_image: 15 | requires: prefect-docker>=0.3.1 16 | image_name: '{{ build_image.image_name }}' 17 | tag: '{{ build_image.tag }}' 18 | 19 | # pull section allows you to provide instructions for cloning this project in remote locations 20 | pull: 21 | 22 | 23 | definitions: 24 | work_pools: 25 | zoomcamp_faq_bot_workpool: &zoomcamp-faq-bot-pool 26 | name: zoomcamp-faq-bot 27 | work_queue_name: docker_queue 28 | job_variables: &job-variables 29 | image: '{{ build_image.image }}' 30 | env: 31 | EXECUTION_ENV: zilliz 32 | auto_remove: true 33 | schedules: 34 | at_0_daily: &at_0_daily 35 | cron: 0 0 * * * 36 | timezone: Europe/Madrid 37 | day_or: true 38 | at_1_daily: &at_1_daily 39 | cron: 0 1 * * * 40 | timezone: Europe/Madrid 41 | day_or: true 42 | at_23_monday: &at_23_monday 43 | cron: 0 23 * * 1 44 | timezone: Europe/Madrid 45 | day_or: true 46 | at_23_tuesday: &at_23_tuesday 47 | cron: 0 23 * * 2 48 | timezone: Europe/Madrid 49 | day_or: true 50 | at_23_wednesday: &at_23_wednesday 51 | cron: 0 23 * * 3 52 | timezone: Europe/Madrid 53 | day_or: true 54 | actions: 55 | docker_build: 56 | - prefect.deployments.steps.run_shell_script: &shell-script-config 57 | id: get-commit-hash 58 | script: git rev-parse --short HEAD 59 | stream_output: false 60 | - prefect_docker.deployments.steps.build_docker_image: &docker-build-config 61 | id: build_image 62 | requires: prefect-docker>=0.3.1 63 | tag: '{{ get-commit-hash.stdout }}' 64 | platform: linux/amd64 65 | 66 | 67 | # the deployments section allows you to provide configuration for deploying flows 68 | deployments: 69 | - name: fill-index-zilliz-ml 70 | tags: 71 | - ml-ingest 72 | - zoomcamp-faq-bot 73 | description: Fill Zilliz index for ML Zoomcamp 74 | schedules: 75 | - *at_23_tuesday 76 | entrypoint: ingest/ml/ingest_ml.py:fill_ml_index 77 | work_pool: *zoomcamp-faq-bot-pool 78 | build: 79 | - prefect.deployments.steps.run_shell_script: *shell-script-config 80 | - prefect_docker.deployments.steps.build_docker_image: 81 | <<: *docker-build-config # Uses the docker_build_config and overrides the dockerfile and image_name fields 82 | dockerfile: ingest/ml.dockerfile 83 | image_name: aaalexlit/zoomcamp-faq-ingest-ml 84 | pull: 85 | - prefect.deployments.steps.set_working_directory: 86 | directory: /usr/src 87 | - name: fill-index-zilliz-de 88 | tags: 89 | - de-ingest 90 | - zoomcamp-faq-bot 91 | description: Fill Zilliz index for DE Zoomcamp 92 | schedules: 93 | - *at_23_monday 94 | entrypoint: ingest/de/ingest_de.py:fill_de_index 95 | work_pool: *zoomcamp-faq-bot-pool 96 | build: 97 | - prefect.deployments.steps.run_shell_script: *shell-script-config 98 | - prefect_docker.deployments.steps.build_docker_image: 99 | <<: *docker-build-config 100 | # Uses the docker_build_config and overrides the dockerfile and image_name fields 101 | dockerfile: ingest/de.dockerfile 102 | image_name: aaalexlit/zoomcamp-faq-ingest-de 103 | pull: 104 | - prefect.deployments.steps.set_working_directory: 105 | directory: /usr/src 106 | - name: fill-index-zilliz-mlops 107 | tags: 108 | - mlops-ingest 109 | - zoomcamp-faq-bot 110 | description: Fill Zilliz index for MLOps Zoomcamp 111 | schedules: 112 | - *at_0_daily 113 | 
entrypoint: ingest/mlops/ingest_mlops.py:fill_mlops_index 114 | work_pool: 115 | <<: *zoomcamp-faq-bot-pool 116 | job_variables: 117 | <<: *job-variables 118 | env: 119 | EXECUTION_ENV: zilliz-cluster 120 | build: 121 | - prefect.deployments.steps.run_shell_script: *shell-script-config 122 | - prefect_docker.deployments.steps.build_docker_image: 123 | <<: *docker-build-config 124 | # Uses the docker_build_config and overrides the dockerfile and image_name fields 125 | dockerfile: ingest/mlops.dockerfile 126 | image_name: aaalexlit/zoomcamp-faq-ingest-mlops 127 | pull: 128 | - prefect.deployments.steps.set_working_directory: 129 | directory: /usr/src 130 | - name: fill-index-zilliz-llm 131 | tags: 132 | - llm-ingest 133 | - zoomcamp-faq-bot 134 | description: Fill Zilliz index for LLM Zoomcamp 135 | schedules: 136 | - *at_23_wednesday 137 | entrypoint: ingest/llm/ingest_llm.py:fill_llm_index 138 | work_pool: 139 | <<: *zoomcamp-faq-bot-pool 140 | job_variables: 141 | <<: *job-variables 142 | env: 143 | EXECUTION_ENV: zilliz-cluster 144 | build: 145 | - prefect.deployments.steps.run_shell_script: *shell-script-config 146 | - prefect_docker.deployments.steps.build_docker_image: 147 | <<: *docker-build-config 148 | # Uses the docker_build_config and overrides the dockerfile and image_name fields 149 | dockerfile: ingest/llm.dockerfile 150 | image_name: aaalexlit/zoomcamp-faq-ingest-llm 151 | pull: 152 | - prefect.deployments.steps.set_working_directory: 153 | directory: /usr/src 154 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | slack-bolt 3 | slack-sdk 4 | langchain 5 | google-api-python-client 6 | google-auth-httplib2 7 | google-auth-oauthlib 8 | sentence-transformers 9 | prefect 10 | prefect-gcp 11 | GitPython 12 | pymilvus 13 | llama-index-core 14 | llama-index-readers-web 15 | llama-index-readers-github 16 | llama-index-vector-stores-milvus 17 | llama-index-embeddings-langchain 18 | llama-index-postprocessor-cohere-rerank 19 | llama-index-llms-langchain 20 | llama-index-llms-fireworks 21 | ipython 22 | cohere 23 | trafilatura 24 | nbconvert 25 | prefect-docker 26 | langchain-openai 27 | upstash-redis 28 | jupyter-notebook-parser 29 | requests==2.31.0 30 | youtube-transcript-api 31 | -------------------------------------------------------------------------------- /slack_bot/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y gcc python3-dev 5 | 6 | WORKDIR /usr/src/app 7 | 8 | COPY requirements.txt ./ 9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | COPY main.py ./ 12 | 13 | CMD [ "python", "-u", "./main.py" ] -------------------------------------------------------------------------------- /slack_bot/README.md: -------------------------------------------------------------------------------- 1 | # Running the bot locally 2 | 3 | 1. re-create separate conda environment using [slack_bot/requirements.txt](../slack_bot/requirements.txt) 4 | ```shell 5 | conda activate base 6 | conda remove --name slack-bot --all 7 | conda create --name slack-bot python=3.10 8 | conda activate slack-bot 9 | cd slack_bot 10 | pip install -r requirements.txt 11 | ``` 12 | 1. Rename [dev.env](../dev.env) to `.env` and set all the required variables 13 | 14 | 1. 
Run ingestion with local milvus following [local_development.md](../ingest/local_development.md) 15 | 16 | 1. Run [main.py](main.py) 17 | 18 | ```shell 19 | source .env 20 | python main.py 21 | ``` 22 | In Pycharm IDE use a provided run configuration [run_bot_local_ws.run.xml](../.run/run_bot_local_ws.run.xml) 23 | -------------------------------------------------------------------------------- /slack_bot/app_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "display_information": { 3 | "name": "FAQBotForMLOps", 4 | "description": "MLOps FAQ as a bot", 5 | "background_color": "#2e7898" 6 | }, 7 | "features": { 8 | "bot_user": { 9 | "display_name": "QABotForMLOps", 10 | "always_online": false 11 | } 12 | }, 13 | "oauth_config": { 14 | "scopes": { 15 | "bot": [ 16 | "app_mentions:read", 17 | "channels:history", 18 | "channels:read", 19 | "chat:write" 20 | ] 21 | } 22 | }, 23 | "settings": { 24 | "event_subscriptions": { 25 | "bot_events": [ 26 | "app_mention" 27 | ] 28 | }, 29 | "interactivity": { 30 | "is_enabled": true 31 | }, 32 | "org_deploy_enabled": false, 33 | "socket_mode_enabled": true, 34 | "token_rotation_enabled": false 35 | } 36 | } -------------------------------------------------------------------------------- /slack_bot/bot_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaalexlit/faq-slack-bot/78962d17dbd68438bd443b7dfdb961ac9d13e574/slack_bot/bot_icon.png -------------------------------------------------------------------------------- /slack_bot/dev.env: -------------------------------------------------------------------------------- 1 | # datatalks slack token 2 | SLACK_APP_TOKEN=xapp-.. 3 | SLACK_BOT_TOKEN=xoxb-.. 4 | 5 | # OpenAI API key 6 | OPENAI_API_KEY=sk-.. 7 | 8 | PINECONE_API_KEY=.. 9 | PINECONE_ENV=.. 10 | 11 | WANDB_API_KEY=.. 12 | LANGCHAIN_API_KEY=lsv2_.. 13 | 14 | ZILLIZ_CLOUD_URI=https://.. 15 | ZILLIZ_CLOUD_API_KEY=.. 16 | 17 | ZILLIZ_PUBLIC_ENDPOINT=https://.. 18 | ZILLIZ_API_KEY=.. 19 | 20 | COHERE_API_KEY=.. 21 | 22 | # DEBUG log level 23 | #LOG_LEVEL=10 24 | -------------------------------------------------------------------------------- /slack_bot/docker-compose-my-workspace.yml: -------------------------------------------------------------------------------- 1 | services: 2 | faq-slack-bot: 3 | build: 4 | context: . 5 | platform: linux/amd64 6 | env_file: 7 | - ../.env 8 | environment: 9 | - LOCALHOST=host.docker.internal 10 | -------------------------------------------------------------------------------- /slack_bot/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | faq-slack-bot: 3 | build: 4 | context: . 
5 | platform: linux/amd64 6 | env_file: 7 | - .env 8 | -------------------------------------------------------------------------------- /slack_bot/main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import hashlib 3 | import logging 4 | import os 5 | import re 6 | import sys 7 | import uuid 8 | 9 | from cohere.core import ApiError as CohereAPIError 10 | from langchain import callbacks 11 | from langchain_openai import ChatOpenAI 12 | from langsmith import Client 13 | from llama_index.core import ChatPromptTemplate 14 | from llama_index.core import VectorStoreIndex, Settings 15 | from llama_index.core import get_response_synthesizer 16 | from llama_index.core.llms import ChatMessage, MessageRole 17 | from llama_index.core.postprocessor import TimeWeightedPostprocessor 18 | from llama_index.core.query_engine import RetrieverQueryEngine 19 | # from llama_index.postprocessor.cohere_rerank import CohereRerank 20 | from llama_index.vector_stores.milvus import MilvusVectorStore 21 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 22 | from requests.exceptions import ChunkedEncodingError 23 | from slack_bolt import App 24 | from slack_bolt.adapter.socket_mode import SocketModeHandler 25 | from slack_sdk.models.views import View 26 | from slack_sdk.web import WebClient 27 | 28 | logging.basicConfig(stream=sys.stdout, 29 | level=os.getenv('LOG_LEVEL', logging.INFO), 30 | format='%(asctime)s %(message)s', 31 | datefmt='%d-%m-%Y %H:%M:%S', ) 32 | logger = logging.getLogger(__name__) 33 | 34 | DE_CHANNELS = ['C01FABYF2RG', 'C06CBSE16JC', 'C06BZJX8PSP'] 35 | ML_CHANNELS = ['C0288NJ5XSA', 'C05C3SGMLBB', 'C05DTQECY66'] 36 | MLOPS_CHANNELS = ['C02R98X7DS9', 'C06C1N46CQ1', 'C0735558X52'] 37 | LLM_CHANNELS = ['C079QE5NAMP', 'C078X7REVN3', 'C06TEGTGM3J'] 38 | 39 | ALLOWED_CHANNELS = DE_CHANNELS + ML_CHANNELS + MLOPS_CHANNELS + LLM_CHANNELS 40 | 41 | PROJECT_NAME = 'datatalks-faq-slackbot' 42 | ML_ZOOMCAMP_PROJECT_NAME = 'ml-zoomcamp-slack-bot' 43 | DE_ZOOMCAMP_PROJECT_NAME = 'de-zoomcamp-slack-bot' 44 | 45 | ML_COLLECTION_NAME = 'mlzoomcamp_faq_git' 46 | DE_COLLECTION_NAME = 'dezoomcamp_faq_git' 47 | MLOPS_COLLECTION_NAME = 'mlopszoomcamp' 48 | LLM_COLLECTION_NAME = 'llmzoomcamp' 49 | 50 | GPT_MODEL_NAME = 'gpt-4o-mini-2024-07-18' 51 | 52 | # Event API & Web API 53 | SLACK_BOT_TOKEN = os.getenv('SLACK_BOT_TOKEN') 54 | SLACK_APP_TOKEN = os.getenv('SLACK_APP_TOKEN') 55 | app = App(token=SLACK_BOT_TOKEN) 56 | langsmith_client = Client() 57 | 58 | 59 | @app.action('upvote') 60 | def add_positive_feedback(ack, body): 61 | ack() 62 | add_feedback(body, 'upvote') 63 | 64 | 65 | @app.action('downvote') 66 | def add_negative_feedback(ack, body): 67 | ack() 68 | add_feedback(body, 'downvote') 69 | 70 | 71 | def add_feedback(body, feedback_type: str): 72 | run_id = None 73 | feedback_id = None 74 | try: 75 | original_blocks = body['message']['blocks'] 76 | actions_block_elements = [block for block in original_blocks if block.get('type') == 'actions'][0]['elements'] 77 | element_to_update = \ 78 | [element for element in actions_block_elements if element.get('action_id') == feedback_type][0] 79 | element_text_to_update = element_to_update['text']['text'] 80 | updated_text, updated_number = increment_number_in_string(element_text_to_update) 81 | element_to_update['text']['text'] = updated_text 82 | 83 | run_id = body['actions'][0]['value'] 84 | feedback_id = get_feedback_id_from_run_id_and_feedback_type(run_id, feedback_type) 85 | 86 | 
user_id = body['user']['id'] 87 | user_name = body['user']['username'] 88 | 89 | logger.info(f'run_id {run_id} {feedback_type}d by {user_name}({user_id})') 90 | 91 | if updated_number > 1: 92 | langsmith_client.update_feedback( 93 | feedback_id=feedback_id, 94 | score=updated_number 95 | ) 96 | else: 97 | langsmith_client.create_feedback( 98 | run_id=run_id, 99 | key=feedback_type, 100 | score=updated_number, 101 | feedback_id=feedback_id 102 | ) 103 | 104 | client.chat_update( 105 | channel=body['channel']['id'], 106 | ts=body['message']['ts'], 107 | blocks=original_blocks, 108 | text=body['message']['text'] 109 | ) 110 | except Exception as ex: 111 | error_message = f'An error occurred when trying to record user feedback with action body =\n{body}\n' 112 | if run_id: 113 | error_message += f'for run_id = {run_id}\n' 114 | if feedback_id: 115 | error_message += f'and feedback_id = {feedback_id}\n' 116 | 117 | logger.error(f'{error_message}' 118 | f'Error: {ex}') 119 | show_feedback_logging_error_modal(body['trigger_id']) 120 | 121 | 122 | def show_feedback_logging_error_modal(trigger_id): 123 | client.views_open(trigger_id=trigger_id, 124 | view=View(type='modal', 125 | title='Error recording feedback', 126 | blocks=[ 127 | { 128 | "type": "section", 129 | "text": { 130 | "type": "mrkdwn", 131 | "text": ( 132 | "An error occurred while attempting to capture your feedback.\n" 133 | "Please try again later. Apologies for the inconvenience.") 134 | } 135 | } 136 | ])) 137 | 138 | 139 | def get_feedback_id_from_run_id_and_feedback_type(run_id, feedback_type): 140 | # Combine run_id UUID bytes and action bytes 141 | combined_bytes = uuid.UUID(run_id).bytes + feedback_type.encode('utf-8') 142 | # Hash the combined bytes 143 | hashed_bytes = hashlib.sha1(combined_bytes).digest() 144 | # Convert hashed bytes to UUID 145 | return uuid.UUID(bytes=hashed_bytes[:16]) 146 | 147 | 148 | # This gets activated when the bot is tagged in a channel 149 | @app.event("app_mention") 150 | def handle_message_events(body): 151 | channel_id = body["event"]["channel"] 152 | event_ts = body["event"]["event_ts"] 153 | user = body["event"]["user"] 154 | 155 | if channel_id not in ALLOWED_CHANNELS: 156 | client.chat_postMessage(channel=channel_id, 157 | thread_ts=event_ts, 158 | text="Apologies, I can't answer questions in this channel") 159 | return 160 | 161 | # Extract question from the message text 162 | question = remove_mentions(str(body["event"]["text"])) 163 | if question.strip() == '': 164 | client.chat_postMessage(channel=channel_id, 165 | thread_ts=event_ts, 166 | text=('Ooops! It seems like your question is empty. 
' 167 | 'Please make sure to tag me in your message along with your question.') 168 | ) 169 | return 170 | logger.info(question) 171 | 172 | # Let the user know that we are busy with the request 173 | greeting_message = get_greeting_message(channel_id) 174 | 175 | posted_greeting_message = client.chat_postMessage(channel=channel_id, 176 | thread_ts=event_ts, 177 | text=greeting_message, 178 | unfurl_links=False) 179 | try: 180 | with callbacks.collect_runs() as cb: 181 | if channel_id in MLOPS_CHANNELS: 182 | response = mlops_query_engine.query(question) 183 | elif channel_id in ML_CHANNELS: 184 | response = ml_query_engine.query(question) 185 | elif channel_id in LLM_CHANNELS: 186 | response = llm_query_engine.query(question) 187 | else: 188 | response = de_query_engine.query(question) 189 | # get the id of the last run that's supposedly a run that delivers the final answer 190 | run_id = cb.traced_runs[-1].id 191 | 192 | response_text = f"Hey, <@{user}>! Here you go: \n{response}" 193 | 194 | response_blocks = [ 195 | { 196 | "type": "section", 197 | "text": { 198 | "type": "mrkdwn", 199 | "text": response_text 200 | } 201 | }, 202 | { 203 | "type": "divider" 204 | }] 205 | if hasattr(response, "source_nodes"): 206 | sources = links_to_source_nodes(response) 207 | references = f"References:\n{sources}" 208 | references_blocks = [{ 209 | "type": "section", 210 | "text": { 211 | "type": "mrkdwn", 212 | "text": references 213 | } 214 | }, 215 | { 216 | "type": "divider" 217 | }] 218 | response_blocks.extend(references_blocks) 219 | 220 | response_blocks.extend([{ 221 | "type": "context", 222 | "elements": [ 223 | { 224 | "type": "mrkdwn", 225 | "text": ":pray: Please leave your feedback to help me improve " 226 | } 227 | ] 228 | }, 229 | { 230 | "type": "actions", 231 | "elements": [ 232 | { 233 | "type": "button", 234 | "text": { 235 | "type": "plain_text", 236 | "text": ":thumbsup: 0" 237 | }, 238 | "style": "primary", 239 | "value": f"{run_id}", 240 | "action_id": "upvote" 241 | }, 242 | { 243 | "type": "button", 244 | "text": { 245 | "type": "plain_text", 246 | "text": ":thumbsdown: 0" 247 | }, 248 | "style": "danger", 249 | "value": f"{run_id}", 250 | "action_id": "downvote" 251 | } 252 | ] 253 | } 254 | ]) 255 | 256 | client.chat_postMessage(channel=channel_id, 257 | thread_ts=event_ts, 258 | blocks=response_blocks, 259 | text=response_text, 260 | unfurl_media=False 261 | ) 262 | client.chat_delete(channel=channel_id, 263 | ts=posted_greeting_message.data['ts']) 264 | except CohereAPIError: 265 | client.chat_postMessage(channel=channel_id, 266 | thread_ts=event_ts, 267 | text="There was an error, please try again later") 268 | except Exception as e: 269 | logger.error(f'Error responding to a query\n{e}') 270 | client.chat_postMessage(channel=channel_id, 271 | thread_ts=event_ts, 272 | text=f"There was an error: {e}") 273 | 274 | 275 | def links_to_source_nodes(response): 276 | res = set() 277 | source_nodes = response.source_nodes 278 | link_template = 'https://datatalks-club.slack.com/archives/{}/p{}' 279 | for node in source_nodes: 280 | # Slack 281 | if 'channel' in node.metadata: 282 | channel_id = node.metadata['channel'] 283 | thread_ts = node.metadata['thread_ts'] 284 | thread_ts_str = str(thread_ts).replace('.', '') 285 | link_template.format(channel_id, thread_ts_str) 286 | res.add(link_template.format(channel_id, thread_ts_str)) 287 | # Google doc 288 | elif 'source' in node.metadata: 289 | title = node.metadata['title'] 290 | if title == 'FAQ': 291 | section_title = 
node.text.split('\n', 1)[0] 292 | res.add(f"<{node.metadata['source']}|" 293 | f" {title}-{section_title}...> ") 294 | else: 295 | res.add(f"<{node.metadata['source']}| {title}>") 296 | # GitHub 297 | elif 'repo' in node.metadata: 298 | repo = node.metadata['repo'] 299 | owner = node.metadata['owner'] 300 | branch = node.metadata['branch'] 301 | file_path = node.metadata['file_path'] 302 | link_to_file = build_repo_path(owner=owner, repo=repo, branch=branch, file_path=file_path) 303 | res.add(f'<{link_to_file}| GitHub-{repo}-{file_path.split("/")[-1]}>') 304 | elif 'yt_link' in node.metadata: 305 | yt_link = node.metadata['yt_link'] 306 | yt_title = node.metadata['yt_title'] 307 | res.add(f'<{yt_link}| Youtube-{yt_title}>') 308 | return '\n'.join(res) 309 | 310 | 311 | def increment_number_in_string(source_string): 312 | # Regular expression to find any sequence of digits (\d+) 313 | pattern = r'(\d+)' 314 | 315 | # Define a lambda function to replace matched digits with the incremented value 316 | replacer = lambda match: str(int(match.group(0)) + 1) 317 | 318 | # Use re.sub() to replace matched digits with the incremented value 319 | result_string = re.sub(pattern, replacer, source_string) 320 | result_number = int(re.search(pattern, result_string).group(0)) 321 | 322 | return result_string, result_number 323 | 324 | 325 | def build_repo_path(owner: str, repo: str, branch: str, file_path: str): 326 | return f'https://github.com/{owner}/{repo}/blob/{branch}/{file_path}' 327 | 328 | 329 | def remove_mentions(input_text): 330 | # Define a regular expression pattern to match the mention 331 | mention_pattern = r'<@U[0-9A-Z]+>' 332 | 333 | return re.sub(mention_pattern, '', input_text) 334 | 335 | 336 | def get_greeting_message(channel_id): 337 | message_template = "Hello from {name} FAQ Bot! :robot_face: \n" \ 338 | "Please note that I'm under active development. " \ 339 | "The answers might not be accurate since I'm " \ 340 | "just a human-friendly interface to the " \ 341 | "" \ 342 | ", this Slack channel, and this course's ." \ 343 | "\nThanks for your request, I'm on it!" 
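    # Pick the course-specific display name, FAQ Google Doc fragment, and GitHub repo slug
    # based on which course channel the bot was mentioned in.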
344 | if channel_id in MLOPS_CHANNELS: 345 | name = 'MLOps' 346 | link = '12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit#heading=h.uwpp1jrsj0d' 347 | repo = 'mlops-zoomcamp' 348 | elif channel_id in ML_CHANNELS: 349 | name = 'ML' 350 | link = '1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8/edit#heading=h.98qq6wfuzeck' 351 | repo = 'machine-learning-zoomcamp' 352 | elif channel_id in LLM_CHANNELS: 353 | name = 'LLM' 354 | link = '1m2KexowAXTmexfC5rVTCSnaShvdUQ8Ag2IEiwBDHxN0/edit#heading=h.o29af0z8xx88' 355 | repo = 'llm-zoomcamp' 356 | else: 357 | name = 'DE' 358 | link = '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit#heading=h.o29af0z8xx88' 359 | repo = 'data-engineering-zoomcamp' 360 | return message_template.format(name=name, link=link, repo=repo) 361 | 362 | 363 | def log_to_langsmith(): 364 | os.environ["LANGCHAIN_TRACING_V2"] = "true" 365 | os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" 366 | os.environ["LANGCHAIN_PROJECT"] = PROJECT_NAME 367 | 368 | 369 | def get_prompt_template(zoomcamp_name: str, cohort_year: int, course_start_date: str) -> ChatPromptTemplate: 370 | system_prompt = ChatMessage( 371 | content=( 372 | "You are a helpful AI assistant for the {zoomcamp_name} ZoomCamp course at DataTalksClub, " 373 | "and you can be found in the course's Slack channel.\n" 374 | "As a trustworthy assistant, you must provide helpful answers to students' questions about the course, " 375 | "and assist them in finding solutions when they encounter problems/errors while following the course. \n" 376 | "You must do it using only the excerpts from the course FAQ document, Slack threads, and GitHub repository " 377 | "that are provided to you, without relying on prior knowledge.\n" 378 | "Current cohort is year {cohort_year} one and the course start date is {course_start_date}. \n" 379 | "Today is {current_date}. Take this into account when answering questions with temporal aspect. \n" 380 | "Here are your guidelines:\n" 381 | "- Provide clear and concise explanations for your conclusions, including relevant evidences, and " 382 | "relevant code snippets if the question pertains to code. \n" 383 | "- Avoid starting your answer with 'Based on the provided ...' or 'The context information ...' " 384 | "or anything like this, instead, provide the information directly in the response.\n" 385 | "- Justify your response in detail by explaining why you made the conclusions you actually made.\n" 386 | "- In your response, refrain from rephrasing the user's question or problem; simply provide an answer.\n" 387 | "- Make sure that the code examples you provide are accurate and runnable.\n" 388 | "- If the question requests confirmation, avoid repeating the question. Instead, conduct your own " 389 | "analysis based on the provided sources.\n" 390 | "- In cases where the provided information is insufficient and you are uncertain about the response, " 391 | "reply with: 'I don't think I have an answer for this; you'll have to ask your fellows or instructors.\n" 392 | "- All the hyperlinks need to be taken from the provided excerpts, not from prior knowledge. 
" 393 | "If there are no hyperlinks provided, abstain from adding hyperlinks to the answer.\n" 394 | "- The hyperlinks need to be formatted the following way: \n" 395 | "Example of the correctly formatted link to github: \n" 396 | "" 397 | ), 398 | role=MessageRole.SYSTEM, 399 | ) 400 | user_prompt = ChatMessage(content=("Excerpts from the course FAQ document, Slack threads, and " 401 | "GitHub repository are below delimited by the dashed lines:\n" 402 | "---------------------\n" 403 | "{context_str}\n" 404 | "---------------------\n" 405 | "Question: {query_str}\n" 406 | "Answer: "), 407 | role=MessageRole.USER, ) 408 | return ChatPromptTemplate(message_templates=[ 409 | system_prompt, 410 | user_prompt, 411 | ], 412 | function_mappings={'zoomcamp_name': lambda **kwargs: zoomcamp_name, 413 | 'cohort_year': lambda **kwargs: cohort_year, 414 | 'current_date': lambda **kwargs: datetime.datetime.now().strftime("%d %B %Y"), 415 | 'course_start_date': lambda **kwargs: course_start_date}) 416 | 417 | 418 | def get_retriever_query_engine(collection_name: str, 419 | zoomcamp_name: str, 420 | cohort_year: int, 421 | course_start_date: str): 422 | if os.getenv('LOCAL_MILVUS', None): 423 | localhost = os.getenv('LOCALHOST', 'localhost') 424 | vector_store = MilvusVectorStore(collection_name=collection_name, 425 | dim=embedding_dimension, 426 | overwrite=False, 427 | uri=f'http://{localhost}:19530') 428 | else: 429 | if collection_name in [MLOPS_COLLECTION_NAME, LLM_COLLECTION_NAME]: 430 | vector_store = MilvusVectorStore(collection_name=collection_name, 431 | uri=os.getenv("ZILLIZ_PUBLIC_ENDPOINT"), 432 | token=os.getenv("ZILLIZ_API_KEY"), 433 | dim=embedding_dimension, 434 | overwrite=False) 435 | else: 436 | vector_store = MilvusVectorStore(collection_name=collection_name, 437 | uri=os.getenv("ZILLIZ_CLOUD_URI"), 438 | token=os.getenv("ZILLIZ_CLOUD_API_KEY"), 439 | dim=embedding_dimension, 440 | overwrite=False) 441 | vector_store_index = VectorStoreIndex.from_vector_store(vector_store, 442 | embed_model=embeddings) 443 | # cohere_rerank = CohereRerank(api_key=os.getenv('COHERE_API_KEY'), top_n=4) 444 | recency_postprocessor = get_time_weighted_postprocessor() 445 | # node_postprocessors = [recency_postprocessor, cohere_rerank] 446 | node_postprocessors = [recency_postprocessor] 447 | qa_prompt_template = get_prompt_template(zoomcamp_name=zoomcamp_name, 448 | cohort_year=cohort_year, 449 | course_start_date=course_start_date) 450 | Settings.llm = ChatOpenAI(model=GPT_MODEL_NAME, 451 | temperature=0.7) 452 | 453 | response_synthesizer = get_response_synthesizer(text_qa_template=qa_prompt_template, 454 | verbose=True, 455 | ) 456 | return RetrieverQueryEngine(vector_store_index.as_retriever(similarity_top_k=15), 457 | node_postprocessors=node_postprocessors, 458 | response_synthesizer=response_synthesizer, 459 | ) 460 | 461 | 462 | def get_time_weighted_postprocessor(): 463 | return TimeWeightedPostprocessor( 464 | last_accessed_key='thread_ts', 465 | time_decay=0.4, 466 | time_access_refresh=False, 467 | top_k=10, 468 | ) 469 | 470 | 471 | if __name__ == "__main__": 472 | client = WebClient(SLACK_BOT_TOKEN) 473 | 474 | logger.info('Downloading embeddings...') 475 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 476 | while True: 477 | try: 478 | embeddings = HuggingFaceEmbedding(model_name='BAAI/bge-base-en-v1.5') 479 | embedding_dimension = len(embeddings.get_text_embedding("test")) 480 | except ChunkedEncodingError as e: 481 | continue 482 | break 483 | 484 | log_to_langsmith() 485 | 486 | 
ml_query_engine = get_retriever_query_engine(collection_name=ML_COLLECTION_NAME, 487 | zoomcamp_name='Machine Learning', 488 | cohort_year=2024, 489 | course_start_date='16 September 2024') 490 | 491 | de_query_engine = get_retriever_query_engine(collection_name=DE_COLLECTION_NAME, 492 | zoomcamp_name='Data Engineering', 493 | cohort_year=2025, 494 | course_start_date='13 January 2025') 495 | 496 | mlops_query_engine = get_retriever_query_engine(collection_name=MLOPS_COLLECTION_NAME, 497 | zoomcamp_name='MLOps', 498 | cohort_year=2024, 499 | course_start_date='13 May 2024') 500 | 501 | llm_query_engine = get_retriever_query_engine(collection_name=LLM_COLLECTION_NAME, 502 | zoomcamp_name='LLM', 503 | cohort_year=2024, 504 | course_start_date='17 June 2024') 505 | SocketModeHandler(app, SLACK_APP_TOKEN).start() 506 | -------------------------------------------------------------------------------- /slack_bot/requirements.txt: -------------------------------------------------------------------------------- 1 | openai==1.61.0 2 | slack-bolt==1.22.0 3 | slack-sdk==3.34.0 4 | langchain==0.3.17 5 | langchain-community==0.3.16 6 | sentence-transformers==3.4.1 7 | cohere==5.15.0 8 | pymilvus==2.5.4 9 | langchain-openai==0.3.3 10 | llama-index-core==0.12.15 11 | llama-index-vector-stores-milvus==0.5.0 12 | llama-index-embeddings-huggingface==0.5.1 13 | llama-index-postprocessor-cohere-rerank==0.4.0 14 | llama-index-llms-langchain==0.5.1 15 | -------------------------------------------------------------------------------- /slack_bot_custom_ingestion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaalexlit/faq-slack-bot/78962d17dbd68438bd443b7dfdb961ac9d13e574/slack_bot_custom_ingestion.png -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import langchain 4 | import pinecone 5 | from langchain.chains import RetrievalQA 6 | from langchain.embeddings import HuggingFaceEmbeddings 7 | from langchain.vectorstores import Pinecone 8 | from langchain.chat_models import ChatOpenAI 9 | 10 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 11 | embeddings = HuggingFaceEmbeddings() 12 | langchain.debug = True 13 | 14 | 15 | def main(question): 16 | pinecone.init( 17 | api_key=os.getenv('PINECONE_API_KEY'), 18 | environment=os.getenv('PINECONE_ENV') 19 | ) 20 | 21 | pinecone_index = Pinecone.from_existing_index(index_name='mlops-faq-bot', 22 | embedding=embeddings) 23 | qa = RetrievalQA.from_chain_type( 24 | llm=ChatOpenAI(model_name='gpt-3.5-turbo-1106'), 25 | retriever=pinecone_index.as_retriever() 26 | ) 27 | qa.return_source_documents = True 28 | print(f"Question: {question}") 29 | 30 | result = qa.apply([question]) 31 | for res in result: 32 | print(res.keys()) 33 | print(f"Question: {res['query']}") 34 | print(f"Answer: {res['result']}") 35 | for doc in res['source_documents']: 36 | print("----------------------------------------------------") 37 | print(f"Metadata: {doc.metadata}") 38 | print(f"Content: {doc.page_content}") 39 | 40 | 41 | if __name__ == "__main__": 42 | # main("How can I solve connection in use problem with mlflow?") 43 | main("MLflow UI throws an error on the browser 'Access to localhost was denied'. Any idea how to resolve this?") 44 | --------------------------------------------------------------------------------