├── .dockerignore
├── .github
    └── workflows
    │   └── python-app.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── LICENSE
├── README.md
├── alembic.ini
├── docker
    ├── Dockerfile
    ├── bot.sh
    └── docker-compose.yaml
├── docs
    └── designdoc.md
├── notebooks
    ├── __init__.py
    ├── chromadb.ipynb
    └── langchain.ipynb
├── poetry.toml
├── pyproject.toml
├── run_bot.sh
├── src
    ├── __init__.py
    ├── app
    │   ├── __init__.py
    │   ├── bot.py
    │   ├── loader.py
    │   └── openai.py
    ├── artifacts
    │   ├── __init__.py
    │   └── migrations
    │   │   ├── README
    │   │   ├── env.py
    │   │   └── script.py.mako
    ├── config
    │   ├── __init__.py
    │   ├── config.py
    │   ├── config.yaml
    │   └── managment.py
    ├── database
    │   ├── __init__.py
    │   ├── backup.py
    │   ├── chroma_service.py
    │   ├── database.py
    │   ├── models.py
    │   └── postgres_service.py
    ├── handlers
    │   ├── __init__.py
    │   ├── admin.py
    │   ├── callbacks.py
    │   ├── commands.py
    │   └── dialog.py
    └── utils
    │   ├── __init__.py
    │   ├── admin_service.py
    │   ├── antifrod.py
    │   ├── extractor.py
    │   ├── filters.py
    │   ├── markup.py
    │   ├── schemas.py
    │   ├── scrapper.py
    │   └── validation.py
└── tests
    ├── __init__.py
    ├── conftest.py
    └── test_db.py


/.dockerignore:
--------------------------------------------------------------------------------
1 | .venv
2 | __pycache__
3 | .mypy_cache
4 | .dockerignore
5 | .git
6 | .gitignore
7 | 


--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
 1 | name: Test
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - master
 6 |       - dev
 7 |       - 'dev/**'
 8 |   pull_request:
 9 |     types: [opened, synchronize]
10 | 
11 | jobs:
12 |   install:
13 |     runs-on: ubuntu-20.04
14 | 
15 |     steps:
16 |       - name: Check out repository
17 |         uses: actions/checkout@v3
18 | 
19 |       - name: Setup python
20 |         uses: actions/setup-python@v4
21 |         with:
22 |           python-version: '3.10.13'
23 | 
24 |       - name: Install Poetry
25 |         uses: snok/install-poetry@v1
26 |         with:
27 |           virtualenvs-create: true
28 |           virtualenvs-in-project: false
29 |           installer-parallel: true
30 | 
31 |       - name: Load cached venv
32 |         id: cached-poetry-dependencies
33 |         uses: actions/cache@v3
34 |         with:
35 |           path: .venv
36 |           key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
37 | 
38 |       - name: Install dependencies
39 |         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
40 |         run: poetry install
41 | 
42 |   ruff:
43 |     runs-on: ubuntu-20.04
44 | 
45 |     steps:
46 |       - uses: actions/checkout@v3
47 |       - uses: chartboost/ruff-action@v1
48 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | .idea/
161 | 
162 | # Artifacts
163 | *.mp3
164 | *.mp4
165 | *.avi
166 | .ruff_cache
167 | *.session
168 | *.sql
169 | *.dump
170 | *.csv
171 | .env-docker
172 | .DS_Store
173 | src/artifacts/migrations/versions/*.py
174 | chroma_db/
175 | src/artifacts/sessions/
176 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # See https://pre-commit.com for more information
 2 | # See https://pre-commit.com/hooks.html for more hooks
 3 | repos:
 4 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 5 |     rev: v3.2.0
 6 |     hooks:
 7 |     -   id: trailing-whitespace
 8 |     -   id: end-of-file-fixer
 9 |     -   id: check-yaml
10 |     -   id: check-added-large-files
11 |     -   id: detect-private-key
12 | - repo: https://github.com/astral-sh/ruff-pre-commit
13 |   # Ruff version.
14 |   rev: v0.1.11
15 |   hooks:
16 |     # Run the linter.
17 |     - id: ruff
18 |       args: [ --fix ]
19 |     # Run the formatter.
20 |     - id: ruff-format


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.11.6
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Kartushov Danil (@torchme)
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 🔎 POSTFINDER
 2 | 
 3 | ---
 4 | ![t.me/postfinder](https://img.shields.io/badge/Telegram-1F1F1F?style=for-the-badge&logo=telegram&logoColor=white) ![ChatGPT API](https://img.shields.io/badge/ChatGPT-1F1F1F?style=for-the-badge&logo=openai&logoColor=white) ![ChatGPT API](https://img.shields.io/badge/ChatGPT-1F1F1F?style=for-the-badge&logo=openai&logoColor=white) ![Chroma DB](https://img.shields.io/badge/Chroma_DB-1F1F1F?style=for-the-badge&logo=chromadb&logoColor=white) ![Python](https://img.shields.io/badge/Python-1F1F1F?style=for-the-badge&logo=python&logoColor=white) ![Docker](https://img.shields.io/badge/Docker-1F1F1F?style=for-the-badge&logo=docker&logoColor=white) ![GitBook](https://img.shields.io/badge/GitBook-1F1F1F?style=for-the-badge&logo=gitbook&logoColor=white)
 5 | 
 6 | ![Static Badge](https://img.shields.io/badge/python-3.10+stable-1f1f1f?style=flat&labelColor=lightblue&color=1f1f1f) ![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/torchme/PostFinder/python-app.yml?style=flat&labelColor=lightblue&color=1f1f1f) ![GitHub Repo stars](https://img.shields.io/github/stars/torchme/PostFinder?style=flat&labelColor=lightblue&color=1f1f1f.svg) ![Github Watchers](https://img.shields.io/github/watchers/torchme/PostFinder?style=flat&labelColor=lightblue&color=1f1f1f.svg) ![GitHub License](https://img.shields.io/github/license/torchme/PostFinder?style=flat&labelColor=lightblue&color=1f1f1f.svg)
 7 | 
 8 | <br />
 9 | 
10 | ---
11 | 
12 | ## What is it?
13 | 
14 | **Post Finder** is a Telegram bot integration based on semantic content search.
15 | 
16 | ---
17 | ## 1. Installation from sources
18 | 
19 | #### Requirements
20 | 
21 | Install required dependencies with the following commands:
22 | 
23 | ```bash
24 | pip install poetry
25 | 
26 | pip install --upgrade pip
27 | 
28 | poetry install
29 | ```
30 | #### Development
31 | 
32 | In development mode, install the pre-commit hooks with one of the following commands:
33 | 
34 | ```bash
35 | pre-commit
36 | ```
37 | 
38 | or
39 | 
40 | ```bash
41 | pre-commit install
42 | ```
43 | 
44 | ## 2. How to run it?
45 | The first step is to create a `.env` file with the secret keys, or a `.env-docker` file if you are going to run the bot with Docker.
46 | 
47 | *Place for documentation .env*
48 | 
49 | #### Run the bot
50 | 
51 | You can run the bot with the following command:
52 | 
53 | ```bash
54 | bash run_bot.sh
55 | ```
56 | 
57 | or with docker:
58 | 
59 | ```bash
60 | docker-compose -f docker/docker-compose.yaml build migrations
61 | docker-compose -f docker/docker-compose.yaml up migrations
62 | ```
63 | 
64 | ## Documentation
65 | 
66 | The official documentation is hosted on [GitBook](https://torchme.gitbook.io/postfinder/)
67 | 


--------------------------------------------------------------------------------
/alembic.ini:
--------------------------------------------------------------------------------
  1 | # A generic, single database configuration.
  2 | 
  3 | [alembic]
  4 | # path to migration scripts
  5 | script_location = src/artifacts/migrations
  6 | 
  7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
  8 | # Uncomment the line below if you want the files to be prepended with date and time
  9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
 10 | # for all available tokens
 11 | file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
 12 | 
 13 | # sys.path path, will be prepended to sys.path if present.
 14 | # defaults to the current working directory.
 15 | prepend_sys_path = .
 16 | 
 17 | # timezone to use when rendering the date within the migration file
 18 | # as well as the filename.
 19 | # If specified, requires the python>=3.9 or backports.zoneinfo library.
 20 | # Any required deps can be installed by adding `alembic[tz]` to the pip requirements
 21 | # string value is passed to ZoneInfo()
 22 | # leave blank for localtime
 23 | # timezone =
 24 | 
 25 | # max length of characters to apply to the
 26 | # "slug" field
 27 | # truncate_slug_length = 40
 28 | 
 29 | # set to 'true' to run the environment during
 30 | # the 'revision' command, regardless of autogenerate
 31 | # revision_environment = false
 32 | 
 33 | # set to 'true' to allow .pyc and .pyo files without
 34 | # a source .py file to be detected as revisions in the
 35 | # versions/ directory
 36 | # sourceless = false
 37 | 
 38 | # version location specification; This defaults
 39 | # to migrations/versions.  When using multiple version
 40 | # directories, initial revisions must be specified with --version-path.
 41 | # The path separator used here should be the separator specified by "version_path_separator" below.
 42 | # version_locations = %(here)s/bar:%(here)s/bat:migrations/versions
 43 | 
 44 | # version path separator; As mentioned above, this is the character used to split
 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
 47 | # Valid values for version_path_separator are:
 48 | #
 49 | # version_path_separator = :
 50 | # version_path_separator = ;
 51 | # version_path_separator = space
 52 | version_path_separator = os  # Use os.pathsep. Default configuration used for new projects.
 53 | 
 54 | # set to 'true' to search source files recursively
 55 | # in each "version_locations" directory
 56 | # new in Alembic version 1.10
 57 | # recursive_version_locations = false
 58 | 
 59 | # the output encoding used when revision files
 60 | # are written from script.py.mako
 61 | # output_encoding = utf-8
 62 | 
 63 | # DSN
 64 | sqlalchemy.url = postgresql+asyncpg://%(DB_USER)s:%(DB_PASS)s@%(DB_HOST)s:%(DB_PORT)s/%(DB_NAME)s?async_fallback=True
 65 | 
 66 | 
 67 | [post_write_hooks]
 68 | # post_write_hooks defines scripts or Python functions that are run
 69 | # on newly generated revision scripts.  See the documentation for further
 70 | # detail and examples
 71 | 
 72 | # format using "black" - use the console_scripts runner, against the "black" entrypoint
 73 | # hooks = black
 74 | # black.type = console_scripts
 75 | # black.entrypoint = black
 76 | # black.options = -l 79 REVISION_SCRIPT_FILENAME
 77 | 
 78 | # lint with attempts to fix using "ruff" - use the exec runner, execute a binary
 79 | # hooks = ruff
 80 | # ruff.type = exec
 81 | # ruff.executable = %(here)s/.venv/bin/ruff
 82 | # ruff.options = --fix REVISION_SCRIPT_FILENAME
 83 | 
 84 | # Logging configuration
 85 | [loggers]
 86 | keys = root,sqlalchemy,alembic
 87 | 
 88 | [handlers]
 89 | keys = console
 90 | 
 91 | [formatters]
 92 | keys = generic
 93 | 
 94 | [logger_root]
 95 | level = WARN
 96 | handlers = console
 97 | qualname =
 98 | 
 99 | [logger_sqlalchemy]
100 | level = WARN
101 | handlers =
102 | qualname = sqlalchemy.engine
103 | 
104 | [logger_alembic]
105 | level = INFO
106 | handlers =
107 | qualname = alembic
108 | 
109 | [handler_console]
110 | class = StreamHandler
111 | args = (sys.stderr,)
112 | level = NOTSET
113 | formatter = generic
114 | 
115 | [formatter_generic]
116 | format = %(levelname)-5.5s [%(name)s] %(message)s
117 | datefmt = %H:%M:%S
118 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.11.6
 2 | 
 3 | WORKDIR /app
 4 | 
 5 | COPY . /app
 6 | 
 7 | RUN pip install poetry
 8 | 
 9 | RUN poetry install
10 | 
11 | RUN chmod +x /app/docker/bot.sh
12 | 


--------------------------------------------------------------------------------
/docker/bot.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | poetry run python -m src.app.bot
4 | 


--------------------------------------------------------------------------------
/docker/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | version: '3.5'
 2 | 
 3 | services:
 4 |   postgres_db:
 5 |     container_name: postgres_db
 6 |     image: postgres:14
 7 |     ports:
 8 |       - "5432:5432"
 9 |     env_file:
10 |       - ../.env-docker
11 | 
12 |   migrations:
13 |     container_name: alembic_migrations
14 |     build:
15 |       context: ../
16 |       dockerfile: docker/Dockerfile
17 |     depends_on:
18 |       - postgres_db
19 |     environment:
20 |       - ENV_STATE=docker
21 |     env_file:
22 |       - ../.env-docker
23 |     command: poetry run alembic upgrade head
24 | 
25 |   bot:
26 |     container_name: bot
27 |     image: docker-migrations:latest
28 |     depends_on:
29 |       - migrations
30 |     environment:
31 |       - ENV_STATE=docker
32 |     env_file:
33 |       - ../.env-docker
34 |     command: ["/app/docker/bot.sh"]
35 | 


--------------------------------------------------------------------------------
/docs/designdoc.md:
--------------------------------------------------------------------------------
  1 | # ML System Design Doc - [RU]
  2 | ## Дизайн ML системы сервиса PostFinder
  3 | 
  4 | ### 1. Цели и предпосылки 
  5 | #### 1.1. Зачем идем в разработку продукта?  
  6 | 
  7 | - **Бизнес-цель:** 
  8 |   - Улучшение доступа к информации в телеграм-каналах через автоматизированный поиск ответов с помощью чат-бота.
  9 | - **Проблематика:**
 10 |   - Накопление повторных вопросов от пользователей, перегружающих поддержку.
 11 |   - Недостаточная видимость старых, но релевантных постов для пользователей.
 12 |   - Долгий процесс поиска и отсутствие хорошей системы поиска внутри телеграма.
 13 | - **Преимущества использования ML:**
 14 |   - Использование современных LLM для автоматизации семантического поиска в большом объеме текстовой информации.
 15 |   - Суммаризация и предоставление релевантных ответов без необходимости ручного поиска.
 16 | - **Критерии успеха:** 
 17 |   - Интеграция бота в активные телеграм-каналы и обеспечение пользователей ответами на вопросы с помощью бота.
 18 |   - Пользователи находят ответы на свои вопросы в нашем телеграм боте.
 19 | - **Пользовательские потребности (исходя из USM и CJM):**
 20 |   - Разрешение пользовательской необходимости в быстром и точном поиске информации внутри телеграм-каналов.
 21 |   - Уменьшение количества повторных вопросов, загружающих поддержку, и предоставление пользователю возможности самостоятельного поиска нужной информации.
 22 |   - Поддержание вовлеченности пользователя за счет удобства и быстроты обнаружения релевантной информации.
 23 |   - USER STORY MAPPING:
 24 |     <img width="4144" alt="Untitled(4)" src="https://github.com/torchme/PostFinder/assets/101402639/0d6b13df-ee32-4fc6-a814-8688275b6286">
 25 | 
 26 |   - CUSTOMER JOURNEY MAP
 27 |     <img width="1823" alt="Untitled(2)" src="https://github.com/torchme/PostFinder/assets/101402639/8c27c562-a038-42a6-9e2f-e68b09e45419">
 28 | 
 29 | - **Ожидаемый пользовательский опыт:**
 30 |   - Пользователи смогут легко и интуитивно задавать вопросы и получать ответы, что улучшит их общее впечатление от использования телеграм-каналов.
 31 |   - Ответы будут предоставлены в формате, который легко ассимилируется и понимается пользователями.
 32 | 
 33 | #### 1.2. Бизнес-требования и ограничения  
 34 | 
 35 | - **Краткое описание бизнес требований**
 36 |   - Разработка интуитивно понятного интерфейса для ввода вопросов в чате.
 37 |   - Создание системы для автоматических ответов на вопросы на основе данных из социальных сетей.
 38 |   - Интеграция с основными социальными платформами с акцентом на экономию времени и повышение вовлеченности пользователей.
 39 |   - Соблюдение правил конфиденциальности и защиты данных, включая GDPR.
 40 | - **Бизнес-ограничения**
 41 |   - Разработка и тестирование в рамках бюджета и сроков проекта.
 42 |   - Обеспечение конфиденциальности и соответствие требованиям GDPR.
 43 |   - Управление зависимостями от сторонних API и поддержание модели в рамках финансовых ограничений.
 44 | - **Итерации проекта**
 45 |   - Первая итерация:
 46 |     - Прототипирование основного функционала для демонстрации возможности системы.
 47 |   - Вторая итерация: 
 48 |     - Разработка MVP для тестирования в контролируемой среде и проведение нагрузочных тестов. Бот должен уметь строить диалоги и искать в определенных телеграм-каналах, а также должна быть реализована система мониторинга аккаунтов.
 49 |   - Третья итерация: 
 50 |     - Оценка производительности на реальных данных и масштабирование. Добавление антифрода аккаунтов и запросов. Добавление сообщений об ошибках, системы лайков и интента команд.
 51 |   - Четвертая итерация:
 52 |     - Доработка системы на основе пользовательской обратной связи и оптимизация производительности. Создание системы подписок и ограничение пользовательских функций и чарн рейт.
 53 | - **Описание бизнес-процесса пилота**
 54 |   - Интеграция и тестирование системы на выбранных платформах.
 55 |   - Сбор и анализ обратной связи для последующих улучшений системы.
 56 | - **Критерии успеха и возможные пути развития проекта**
 57 |   - Снижение числа повторных вопросов и повышение качества ответов.
 58 |   - Развитие: расширение функциональности, поддержка новых платформ, улучшение алгоритмов.
 59 | 
 60 | #### 1.3. Что входит в скоуп проекта/итерации, что не входит   
 61 | 
 62 | - **Какие БТ будут покрыты с технической точки зрения за первую итерацию:**
 63 |   - Интеграция с API для анализа и классификации запросов пользователей на естественном языке.
 64 |   - Реализация базы данных для хранения и извлечения данных с использованием chroma db.
 65 |   - Разработка и тестирование алгоритма поиска для эффективного нахождения релевантных ответов.
 66 | - **Что не будет закрыто:**
 67 |   - Ограниченная интеграция с социальными платформами, сосредоточенная только на основных платформах.
 68 |   - Итерационная оптимизация производительности: начальные версии могут не поддерживать полный объем запросов.
 69 |   - Неполная асинхронность обработки запросов и масштабирование системы.
 70 | - **Какие БТ будут покрыты с технической точки зрения за вторую итерацию:**
 71 |   - Создание диалоговой системы, чат-бота с RAG и Memory
 72 |   - Асинхронность запросов
 73 |   - Ручной фрод аккаунтов (Админка)
 74 | - **Какие БТ будут покрыты с технической точки зрения за третью итерацию:**
 75 |   - Интент команды
 76 |   - Поиск без указания канала или ресурса
 77 |   - Автоматический фрод аккаунтов или запросов
 78 | - **Описание результата с точки зрения качества кода и воспроизводимости**
 79 |   - Код
 80 |     - Соответствие стандартам чистого кода и PEP8, использование Numpy Docstring для документирования.
 81 |     - Применение pre-commit-hooks для автоматической проверки кода RUFF. 
 82 |   - Все тесты в GitHub actions должны быть пройдены.
 83 |     - RUFF ✅
 84 |     - Успешная установка зависимостей через Poetry и сборка виртуального окружения. ✅
 85 |     - Отсутствие конфликтов среды и пакетов, корректная сборка Dockerfile. ❌
 86 |     - Проведение нагрузочных тестов и асинхронная работа кода, проверка функциональности бота, подключения к API и работоспособности баз данных. ❌
 87 |     - Все изменения кода подвергаются code review перед слиянием в основную ветку. ✅
 88 |   - Последовательность проверки кода:
 89 |     - Выбор задачи в todoist
 90 |     - Создание ветки в Git для этой задачи и ее выполнение
 91 |     - Pull request в Main с коротким описанием изменений
 92 |     - Код проверяет более опытный товарищ и после этого мерджит в Main или пишет, какие коррективы стоит еще внести
 93 |     - Задача закрывается как выполненная в todoist
 94 | - **Описание планируемого технического долга**
 95 |   - Рассмотрение возможности интеграции с дополнительными API, включая OpenAI, для улучшения функциональности и качества ответов. ❌
 96 |   - Исследование альтернативных сервисов социальных сетей и способов интеграции. ✅
 97 |   - Эксперименты с различными запросами и промптами для обучения модели, а также с токенайзерами и разбиением больших файлов. ❌
 98 | 
 99 | #### 1.4. Предпосылки решения  
100 | 
101 | - Для создания системы, которая отвечает на потребности бизнеса и пользователей, учитываем следующие предпосылки:
102 |   - **Используемые данные:** Взаимодействие с пользовательскими запросами и историческими данными постов в телеграм-каналах. Данные будут содержать текст запросов и контекст, в котором они были сделаны.
103 |   - **Горизонт прогноза:** Система будет ориентирована на немедленный ответ без прогнозирования долгосрочных трендов.
104 |   - **Гранулярность модели:** Ответы на вопросы будут генерироваться на уровне каждого запроса с использованием контекстно-зависимой обработки естественного языка.
105 |   - **Обоснование выбора данных и технологий:** Выбор базируется на способности LLM обрабатывать и понимать естественный язык, а также их способности обобщать и извлекать информацию из большого массива текста.
106 | ### 2. Методология `Data Scientist`     
107 | 
108 | #### 2.1. Постановка задачи  
109 | 
110 | - Решаемая техническая задача – разработка чат-бот-системы на основе LLM для автоматического ответа на вопросы пользователей с использованием исторических данных постов из телеграма и/или других сервисов. Система включает элементы рекомендательной системы и поисковика аномалий в пользовательских запросах для предотвращения спама и нерелевантных запросов.
111 | 
112 | #### 2.2. Блок-схема решения  
113 | 
114 | Блок-схема будет включать следующие ключевые этапы:
115 | 
116 |   - Подготовка данных: Препроцессинг запросов и исторических данных постов.
117 |   - Разработка модели: Прототипирование и настройка LLM для интерпретации запросов и поиска ответов.
118 |   - Оптимизация: Тонкая настройка и рефакторинг для улучшения точности и скорости ответов.
119 |   - Тестирование: Валидация системы на реальных пользователях и сбор обратной связи.
120 |   - Закрытие технического долга: Работа над известными проблемами и улучшение инфраструктуры.
121 |   - Подготовка пилота: Интеграция системы с тестовыми каналами и начальные испытания.
122 | 
123 | #### 2.3. Этапы решения задачи `Data Scientist`  
124 | 
125 | При обращении к боту выполняется регистрация/аутентификация пользователя. Информация о пользователях хранится в базе данных PostgreSQL, имеющей следующую структуру:
126 | <table>
127 | <tr valign="top" style="border: none; border-collapse: collapse;">
128 | <td style="border: none; border-collapse: collapse;">
129 | 
130 | | user                                                                     |
131 | |--------------------------------------------------------------------------|
132 | | <span style="color:blue">**user_id**</span>                                                              |
133 | | telegram_id <br/>username<br/>first_name<br/>last_name<br/>registered_at |
134 | 
135 | </td><td style="border: none; border-collapse: collapse;">
136 | 
137 | | subscription_type                            |
138 | |----------------------------------------------|
139 | | <span style="color:green">**type_id**</span> |
140 | | type_name<br/>montly_price                   |
141 | 
142 | </td><td style="border: none; border-collapse: collapse;">
143 | 
144 | | user_subscription                                                                                                        |
145 | |--------------------------------------------------------------------------------------------------------------------------|
146 | | subscription_id                                                                                                          |
147 | | <span style="color:blue">**user_id**</span><br/><span style="color:green">**type_id**</span><br/>valid_from<br/>valid_to |
148 | 
149 | </td></tr></table>
150 | 
151 | - Этап 1 – Подготовка данных: Загрузка, предобработка и векторизация постов *целевого канала** для обучения модели, сохранение полученных эмбеддингов, например, в Chroma DB. На выходе — набор эмбеддингов постов *целевого канала**.  
152 | **Целевой канал: для автора – канал, к которому он подключает чат-бота, для пользователя – канал, который он выбирает для поиска информации.*
153 | - Этап 2 – Обработка запроса пользователя: Получение, предобработка и векторизация запроса пользователя, передача его в систему семантического поиска. На выходе — эмбеддинг запроса пользователя.  
154 | - Этап 3 – Семантический поиск: Определение схожести* запроса пользователя и информации в постах канала. На выходе — набор постов целевого канала, соответствующих запросу пользователя и отсортированных в порядке значимости согласно выбранному алгоритму.  
155 | **Возможные варианты определения схожести: косинусное сходство, расстояние Жаккара, предварительно обученные языковые модели (BERT, GPT и др.) для измерения сходства эмбеддингов.  
156 | Необходимо тестирование для определения оптимального варианта с учетом обрабатываемого объема информации, доступных вычислительных ресурсов и качества получаемого результата.*
157 | - Этап 4 – Получение обратной связи: Оценка пользователем качества выданных чат-ботом ответов. Это необходимо для оценки работы алгоритма и выбора направлений для улучшения качества выдачи.
158 | - Этап 5 – Адаптация модели: Добавление системы отслеживания метрик эффективности, например, точности и полноты ответов. Здесь же может быть фиксация изменений качества при выборе на этапе 3 различных API.
159 | - Этап 6 – Формирование отчета: Описание используемых методов и параметров, достигнутых показателей эффективности, полученных данных для анализа и дальнейших доработок и улучшения чат-бота.
160 |   
161 | ### 3. Подготовка пилота  
162 |   
163 | #### 3.1. Способ оценки пилота  
164 |   
165 | - Для оценки пилота сервиса PostFinder будет использован подход A/B тестирования, где одна группа пользователей продолжит использовать телеграм-канал без интегрированной системы PostFinder, а другая группа получит доступ к сервису. Оценка эффективности будет проводиться на основе следующих параметров:
166 | 
167 |   - **Время на поиск ответа:** Измерение времени, которое потребуется пользователям на нахождение информации с помощью PostFinder по сравнению с традиционным способом.
168 |   - **Точность ответов:** Анализ релевантности предоставляемых ответов, основанный на отзывах пользователей и их взаимодействии с ботом.
169 |   - **Удовлетворенность пользователей:** Опросы пользователей для сбора данных об их удовлетворенности сервисом.
170 |   - **Количество повторных запросов:** Сравнение количества повторных вопросов в чате при использовании сервиса PostFinder и без него.
171 |   - **LTV:** Количество пользователей, которые остаются с нашим продуктом, и количество тех, кто от него уходит.
172 |   - **DAU:** Количество уникальных пользователей, которые ежедневно приходят к нам.
173 |   
174 | #### 3.2. Что считаем успешным пилотом  
175 |   
176 | Пилот считается успешным, если наблюдается статистически значимое улучшение в следующих областях:
177 | 
178 |   - **Снижение времени на поиск ответа:** Значительное уменьшение времени, которое пользователи тратят на поиск ответов на свои вопросы.
179 |   - **Повышение точности ответов:** Увеличение процента точных и релевантных ответов, полученных пользователями.
180 |   - **Улучшение удовлетворенности пользователей:** Позитивная динамика в отзывах пользователей и оценках удовлетворенности сервисом.
181 |   - **Сокращение повторных запросов:** Уменьшение количества повторных вопросов на одни и те же темы, что свидетельствует об улучшении доступности информации.
182 | 
183 | #### 3.3. Подготовка пилота  
184 |   
185 |   - **Анализ нагрузки:** Оценка максимального количества запросов, которые модель сможет обработать в реальном времени.
186 |   - **Оценка ресурсов:** Расчет необходимого объема вычислительных ресурсов для обработки пользовательских запросов.
187 |   - **Определение бюджета:** Установление бюджета, доступного для проведения пилота, и соответствующего распределения ресурсов.
188 |   - **Экспериментальные ограничения:** В случае ограниченных ресурсов, определение максимального количества пользователей, которое может быть включено в пилот.
189 | 
190 | В процессе эксперимента с бейзлайном может быть уточнена вычислительная сложность, и, соответственно, скорректированы параметры пилота с учетом полученных данных.
191 | 
192 | ### 4. Внедрение `для production систем, если требуется`    
193 | 
194 | #### 4.1. Архитектура решения   
195 |   
196 | 4.1. Архитектура решения
197 | 
198 | - Для внедрения в production система PostFinder будет развернута в соответствии с моделью, которая предусматривает масштабируемость, отказоустойчивость и быстрый отклик. Архитектура системы разделена на следующие компоненты:
199 | 
200 |   - **Веб-интерфейс или API для пользовательских запросов:** Будет обрабатывать входящие запросы от пользователей и возвращать ответы.
201 |   - **Сервис обработки естественного языка (NLP):** Использует модели LLM для интерпретации запросов и поиска соответствующих ответов в базе данных.
202 |   - **База данных:** Хранит информацию о постах и исторические данные запросов пользователей.
203 |   - **Система управления сессиями:** Поддерживает состояние взаимодействия с пользователем.
204 |   - **Балансировщик нагрузки:** Распределяет запросы по серверам для оптимизации производительности и предотвращения перегрузки.
205 |  
206 | Блок-схема архитектуры доступна по [ссылке]()
207 |   
208 | #### 4.2. Описание инфраструктуры и масштабируемости 
209 |   
210 | - Описание инфраструктуры и масштабируемости
211 | 
212 | Инфраструктура решения будет построена на облачной платформе с использованием контейнеризации и оркестрации для обеспечения масштабируемости и эластичности. Возможности автоматического масштабирования и самовосстановления системы будут включены для обработки колебаний нагрузки.
213 |   - CI/CD
214 |   - GitHub Actions
215 |   - TimeWeb/Selectel/YandexCloud/MtsCloud
216 |   - AirFlow/Dagster
217 |   - Docker
218 |   
219 | #### 4.3. Требования к работе системы  
220 | 
221 | Система будет разработана с учетом следующих требований:
222 | 
223 |   - **Отказоустойчивость:** Реализация нескольких уровней резервирования и быстрого восстановления работы в случае сбоев.
224 |   - **Отклик:** Система нацелена на предоставление ответов в рамках ~1ms, что обеспечивает почти мгновенный отклик на пользовательские запросы.
225 |   
226 | #### 4.4. Безопасность данных   
227 | 
228 | Внедрение системы будет включать строгие меры безопасности для защиты данных пользователей и обеспечения соответствия нормативным требованиям, таким как GDPR. 
229 | 
230 | Разработка включает:
231 |   - **Шифрование данных:** Все данные, передаваемые и хранимые системой, будут зашифрованы.
232 |   - **Управление доступом:** Строгая политика управления доступом для предотвращения неавторизованного доступа к информации. [Антифрод запросов, аккаунтов, телеграм-каналов]
233 | 
234 | #### 4.5. Риски  
235 | 
236 | В процессе внедрения системы учитываются следующие риски:
237 |   - **Отключение API сервисов:** Разработка альтернативных механизмов взаимодействия с данными в случае изменения условий использования API сторонних сервисов.
238 |   - **Атаки на сервис:** Реализация системы мониторинга и средств обнаружения вторжений для своевременного реагирования на угрозы безопасности. [Антифрод]
239 |   - **Проблемы с масштабированием:** Планирование архитектуры системы с учетом возможности легкого горизонтального масштабирования для поддержания высокой производительности при увеличении числа пользователей.
240 | 
241 | Внедрение системы будет проводиться этапами, начиная с ограниченного пилотного запуска и постепенно расширяясь до полноценной интеграции в production, что позволит минимизировать риски и гарантировать постоянное качество обслуживания пользователей.
242 |   
243 | ## 5. Контрибьюторы 
244 | 
245 | <a href="https://github.com/torchme">
246 | <img src="https://github.com/torchme.png?size=50" height="50" title="torchme"></a>&nbsp;
247 | <a href="https://github.com/dimages">
248 | <img src="https://github.com/dimages.png?size=50" title="dimages"></a>&nbsp;
249 | <a href="https://github.com/gr3eda1g0">
250 | <img src="https://github.com/gr3eda1g0.png?size=50" title="gr3eda1g0"></a>
251 | <a href="https://github.com/mayb333">
252 | <img src="https://github.com/mayb333.png?size=50" title="mayb333"></a>
253 | <a href="https://github.com/umbilnm">
254 | <img src="https://github.com/umbilnm.png?size=50" title="umbilnm"></a>
255 | 
256 | ## 6. Благодарности
257 | - **[Karpov Courses](https://karpov.courses/simulator-ml)**
258 | - **[Reliable ML](https://t.me/reliable_ml/196)**
259 | - **[AI Talent Hub](https://ai.itmo.ru/)**
260 | 
261 | ## 7. Дополнительные документы
262 | - **[Превратите свой пет-проект из хобби в карьеру](https://habr.com/ru/articles/801549/)**
263 | 


--------------------------------------------------------------------------------
/notebooks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torchme/PostFinder/14de2c2ea8e6b26e57ba03662ddeec7d8bf1991c/notebooks/__init__.py


--------------------------------------------------------------------------------
/notebooks/chromadb.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "06fffa69",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# FIX BUG WITH SQLITE\n",
 11 |     "# https://gist.github.com/defulmere/8b9695e415a44271061cc8e272f3c300\n",
 12 |     "\n",
 13 |     "__import__('pysqlite3')\n",
 14 |     "import sys\n",
 15 |     "import os\n",
 16 |     "sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')\n",
 17 |     "\n",
 18 |     "DATABASES = {\n",
 19 |     "    'default': {\n",
 20 |     "        'ENGINE': 'django.db.backends.sqlite3',\n",
 21 |     "        'NAME': os.path.join('..', 'db.sqlite3'),\n",
 22 |     "    }\n",
 23 |     "}"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 2,
 29 |    "id": "13e04b3f-4fba-405a-80d5-798a935c5e6e",
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "import os\n",
 34 |     "import openai\n",
 35 |     "from langchain.vectorstores import Chroma\n",
 36 |     "from langchain.embeddings.openai import OpenAIEmbeddings\n",
 37 |     "from dotenv import load_dotenv\n",
 38 |     "from chromadb.utils import embedding_functions\n",
 39 |     "from langchain.schema.document import Document"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 3,
 45 |    "id": "7c99c48d",
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "import pandas as pd"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 5,
 55 |    "id": "25aa8d0a",
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "data = pd.read_csv(\"../src/artifacts/@karpovcourseschat.csv\", delimiter=\";\")"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": 6,
 65 |    "id": "96e8b215",
 66 |    "metadata": {},
 67 |    "outputs": [
 68 |     {
 69 |      "data": {
 70 |       "text/html": [
 71 |        "<div>\n",
 72 |        "<style scoped>\n",
 73 |        "    .dataframe tbody tr th:only-of-type {\n",
 74 |        "        vertical-align: middle;\n",
 75 |        "    }\n",
 76 |        "\n",
 77 |        "    .dataframe tbody tr th {\n",
 78 |        "        vertical-align: top;\n",
 79 |        "    }\n",
 80 |        "\n",
 81 |        "    .dataframe thead th {\n",
 82 |        "        text-align: right;\n",
 83 |        "    }\n",
 84 |        "</style>\n",
 85 |        "<table border=\"1\" class=\"dataframe\">\n",
 86 |        "  <thead>\n",
 87 |        "    <tr style=\"text-align: right;\">\n",
 88 |        "      <th></th>\n",
 89 |        "      <th>message_id</th>\n",
 90 |        "      <th>date</th>\n",
 91 |        "      <th>text</th>\n",
 92 |        "    </tr>\n",
 93 |        "  </thead>\n",
 94 |        "  <tbody>\n",
 95 |        "    <tr>\n",
 96 |        "      <th>0</th>\n",
 97 |        "      <td>192405</td>\n",
 98 |        "      <td>2024-01-05 11:11:44+00:00</td>\n",
 99 |        "      <td>а я сильно загружал: обучался питону. ролики, ...</td>\n",
100 |        "    </tr>\n",
101 |        "    <tr>\n",
102 |        "      <th>1</th>\n",
103 |        "      <td>192404</td>\n",
104 |        "      <td>2024-01-05 11:05:58+00:00</td>\n",
105 |        "      <td>Не особо</td>\n",
106 |        "    </tr>\n",
107 |        "    <tr>\n",
108 |        "      <th>2</th>\n",
109 |        "      <td>192403</td>\n",
110 |        "      <td>2024-01-05 11:02:47+00:00</td>\n",
111 |        "      <td>видимо, нормально для этих моделей 🤔</td>\n",
112 |        "    </tr>\n",
113 |        "    <tr>\n",
114 |        "      <th>3</th>\n",
115 |        "      <td>192402</td>\n",
116 |        "      <td>2024-01-05 11:02:25+00:00</td>\n",
117 |        "      <td>оперативу загружаешь?</td>\n",
118 |        "    </tr>\n",
119 |        "    <tr>\n",
120 |        "      <th>4</th>\n",
121 |        "      <td>192401</td>\n",
122 |        "      <td>2024-01-05 11:00:51+00:00</td>\n",
123 |        "      <td>У меня такой мак уже год, 90% времени подключе...</td>\n",
124 |        "    </tr>\n",
125 |        "    <tr>\n",
126 |        "      <th>5</th>\n",
127 |        "      <td>192400</td>\n",
128 |        "      <td>2024-01-05 10:59:43+00:00</td>\n",
129 |        "      <td>с августа 21 года пользовались\\nсерийник бьётс...</td>\n",
130 |        "    </tr>\n",
131 |        "    <tr>\n",
132 |        "      <th>6</th>\n",
133 |        "      <td>192399</td>\n",
134 |        "      <td>2024-01-05 10:59:29+00:00</td>\n",
135 |        "      <td>hdmi</td>\n",
136 |        "    </tr>\n",
137 |        "    <tr>\n",
138 |        "      <th>7</th>\n",
139 |        "      <td>192398</td>\n",
140 |        "      <td>2024-01-05 10:58:59+00:00</td>\n",
141 |        "      <td>к монитору через type c или hdmi???\\nтут же ес...</td>\n",
142 |        "    </tr>\n",
143 |        "    <tr>\n",
144 |        "      <th>8</th>\n",
145 |        "      <td>192397</td>\n",
146 |        "      <td>2024-01-05 10:58:58+00:00</td>\n",
147 |        "      <td>?</td>\n",
148 |        "    </tr>\n",
149 |        "    <tr>\n",
150 |        "      <th>9</th>\n",
151 |        "      <td>192396</td>\n",
152 |        "      <td>2024-01-05 10:58:27+00:00</td>\n",
153 |        "      <td>бу сколько</td>\n",
154 |        "    </tr>\n",
155 |        "  </tbody>\n",
156 |        "</table>\n",
157 |        "</div>"
158 |       ],
159 |       "text/plain": [
160 |        "   message_id                       date  \\\n",
161 |        "0      192405  2024-01-05 11:11:44+00:00   \n",
162 |        "1      192404  2024-01-05 11:05:58+00:00   \n",
163 |        "2      192403  2024-01-05 11:02:47+00:00   \n",
164 |        "3      192402  2024-01-05 11:02:25+00:00   \n",
165 |        "4      192401  2024-01-05 11:00:51+00:00   \n",
166 |        "5      192400  2024-01-05 10:59:43+00:00   \n",
167 |        "6      192399  2024-01-05 10:59:29+00:00   \n",
168 |        "7      192398  2024-01-05 10:58:59+00:00   \n",
169 |        "8      192397  2024-01-05 10:58:58+00:00   \n",
170 |        "9      192396  2024-01-05 10:58:27+00:00   \n",
171 |        "\n",
172 |        "                                                text  \n",
173 |        "0  а я сильно загружал: обучался питону. ролики, ...  \n",
174 |        "1                                           Не особо  \n",
175 |        "2               видимо, нормально для этих моделей 🤔  \n",
176 |        "3                              оперативу загружаешь?  \n",
177 |        "4  У меня такой мак уже год, 90% времени подключе...  \n",
178 |        "5  с августа 21 года пользовались\\nсерийник бьётс...  \n",
179 |        "6                                               hdmi  \n",
180 |        "7  к монитору через type c или hdmi???\\nтут же ес...  \n",
181 |        "8                                                  ?  \n",
182 |        "9                                         бу сколько  "
183 |       ]
184 |      },
185 |      "execution_count": 6,
186 |      "metadata": {},
187 |      "output_type": "execute_result"
188 |     }
189 |    ],
190 |    "source": [
191 |     "data\n"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "markdown",
196 |    "id": "c9dcf79c",
197 |    "metadata": {},
198 |    "source": [
199 |     "# Создаем функцию эмбеддингов"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 7,
205 |    "id": "ea14a5ad",
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "load_dotenv(dotenv_path='../.env')\n",
210 |     "PROXY_API_KEY = os.getenv(\"PROXY_API_KEY\")\n"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": 8,
216 |    "id": "142aea4e",
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "\n",
221 |     "\n",
222 |     "emb_fn = OpenAIEmbeddings(\n",
223 |     "    api_key=os.getenv(\"PROXY_API_KEY\"),\n",
224 |     "    model=\"text-embedding-ada-002\",\n",
225 |     "    base_url=\"https://api.proxyapi.ru/openai/v1\",\n",
226 |     ")"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 9,
232 |    "id": "e584d8a9",
233 |    "metadata": {},
234 |    "outputs": [
235 |     {
236 |      "data": {
237 |       "text/plain": [
238 |        "OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7fe21d189f60>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7fe21d0041c0>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='https://api.proxyapi.ru/openai/v1', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='sk-***REDACTED***', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)"
239 |       ]
240 |      },
241 |      "execution_count": 9,
242 |      "metadata": {},
243 |      "output_type": "execute_result"
244 |     }
245 |    ],
246 |    "source": [
247 |     "emb_fn"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "markdown",
252 |    "id": "5aaa8081",
253 |    "metadata": {},
254 |    "source": [
255 |     "# Создаем ChromaDB"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 12,
261 |    "id": "c4e64019",
262 |    "metadata": {},
263 |    "outputs": [],
264 |    "source": [
265 |     "chroma_client = chromadb.PersistentClient(path=\"./chroma_db_2\")\n"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 14,
271 |    "id": "82e3d81d",
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "collection = chroma_client.get_or_create_collection(name=\"my_collection\")"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": 16,
281 |    "id": "b83501ad",
282 |    "metadata": {},
283 |    "outputs": [],
284 |    "source": [
285 |     "docs = data[\"text\"].to_list()\n",
286 |     "\n",
287 |     "collection.add(\n",
288 |     "    documents=docs,\n",
289 |     "    embeddings=emb_fn.embed_documents(docs),\n",
290 |     "    metadatas=[{\"source\": \"local\"} for _ in docs],\n",
291 |     "    ids=[f\"id{i}\" for i in range(len(docs))],\n",
292 |     ")"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": 19,
298 |    "id": "c2faf92f",
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "collection = chroma_client.get_collection(name = \"my_collection\")"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 26,
308 |    "id": "16efa8b4",
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "user_query = \"как дела?\"\n",
313 |     "\n",
314 |     "search_result = collection.query(\n",
315 |     "  query_embeddings=emb_fn.embed_query(user_query),\n",
316 |     "  n_results=5\n",
317 |     ")"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": 32,
323 |    "id": "c4693d1e",
324 |    "metadata": {},
325 |    "outputs": [
326 |     {
327 |      "data": {
328 |       "text/plain": [
329 |        "['к монитору через type c или hdmi???\\nтут же есть разница большая',\n",
330 |        " 'бу сколько',\n",
331 |        " '?',\n",
332 |        " 'видимо, нормально для этих моделей 🤔',\n",
333 |        " 'оперативу загружаешь?']"
334 |       ]
335 |      },
336 |      "execution_count": 32,
337 |      "metadata": {},
338 |      "output_type": "execute_result"
339 |     }
340 |    ],
341 |    "source": [
342 |     "results = search_result['documents'][0][::-1]"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "code",
347 |    "execution_count": 38,
348 |    "id": "1dd76cc8",
349 |    "metadata": {},
350 |    "outputs": [
351 |     {
352 |      "data": {
353 |       "text/plain": [
354 |        "[Collection(name=my_collection)]"
355 |       ]
356 |      },
357 |      "execution_count": 38,
358 |      "metadata": {},
359 |      "output_type": "execute_result"
360 |     }
361 |    ],
362 |    "source": []
363 |   },
364 |   {
365 |    "cell_type": "code",
366 |    "execution_count": null,
367 |    "id": "122e73c0",
368 |    "metadata": {},
369 |    "outputs": [],
370 |    "source": []
371 |   }
372 |  ],
373 |  "metadata": {
374 |   "kernelspec": {
375 |    "display_name": "Python 3.10.5 ('.venv': poetry)",
376 |    "language": "python",
377 |    "name": "python3"
378 |   },
379 |   "language_info": {
380 |    "codemirror_mode": {
381 |     "name": "ipython",
382 |     "version": 3
383 |    },
384 |    "file_extension": ".py",
385 |    "mimetype": "text/x-python",
386 |    "name": "python",
387 |    "nbconvert_exporter": "python",
388 |    "pygments_lexer": "ipython3",
389 |    "version": "3.10.13"
390 |   },
391 |   "vscode": {
392 |    "interpreter": {
393 |     "hash": "de87916773026c2a3e75fb54c4f93bfed36f59c832b56191b02f8dd5801604e3"
394 |    }
395 |   }
396 |  },
397 |  "nbformat": 4,
398 |  "nbformat_minor": 5
399 | }
400 | 


--------------------------------------------------------------------------------
/notebooks/langchain.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "# FIX BUG WITH SQLITE\n",
 10 |     "# https://gist.github.com/defulmere/8b9695e415a44271061cc8e272f3c300\n",
 11 |     "\n",
 12 |     "__import__('pysqlite3')\n",
 13 |     "import sys\n",
 14 |     "import os\n",
 15 |     "sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')\n",
 16 |     "\n",
 17 |     "DATABASES = {\n",
 18 |     "    'default': {\n",
 19 |     "        'ENGINE': 'django.db.backends.sqlite3',\n",
 20 |     "        'NAME': os.path.join('..', 'db.sqlite3'),\n",
 21 |     "    }\n",
 22 |     "}"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 2,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "import os\n",
 32 |     "import openai\n",
 33 |     "from dotenv import load_dotenv\n",
 34 |     "from langchain_community.document_loaders import TextLoader\n",
 35 |     "from langchain.embeddings import OpenAIEmbeddings\n",
 36 |     "from langchain.text_splitter import CharacterTextSplitter\n",
 37 |     "from langchain.vectorstores import Chroma\n",
 38 |     "from langchain.retrievers.multi_query import MultiQueryRetriever\n",
 39 |     "from langchain.prompts import PromptTemplate\n",
 40 |     "from langchain_community.llms import OpenAI\n",
 41 |     "from langchain.chains import VectorDBQA\n",
 42 |     "from langchain.schema.document import Document"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 3,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "from langchain_community.document_loaders.csv_loader import CSVLoader"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 4,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "loader = CSVLoader(file_path=os.path.join(sys.path[0], \"../src/artifacts/@karpovcourseschat.csv\"), source_column=\"text\", metadata_columns=[\"date\", \"message_id\"], csv_args={\n",
 61 |     "    \"delimiter\": \";\",\n",
 62 |     "})\n",
 63 |     "data = loader.load()"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 5,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "load_dotenv(dotenv_path='../.env')\n",
 73 |     "PROXY_API_KEY = os.getenv(\"PROXY_API_KEY\")"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 6,
 79 |    "metadata": {},
 80 |    "outputs": [
 81 |     {
 82 |      "name": "stderr",
 83 |      "output_type": "stream",
 84 |      "text": [
 85 |       "/home/kartushov/pet_projects/PostFinder/.venv/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:189: LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use langchain_openai.OpenAIEmbeddings instead.\n",
 86 |       "  warn_deprecated(\n"
 87 |      ]
 88 |     }
 89 |    ],
 90 |    "source": [
 91 |     "emb_fn = OpenAIEmbeddings(\n",
 92 |     "    api_key=os.getenv(\"PROXY_API_KEY\"),\n",
 93 |     "    model=\"text-embedding-ada-002\",\n",
 94 |     "    base_url=\"https://api.proxyapi.ru/openai/v1\",\n",
 95 |     ")"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 7,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "db = Chroma.from_documents(data, emb_fn)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 8,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "from langchain.chat_models import ChatOpenAI"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 9,
119 |    "metadata": {},
120 |    "outputs": [
121 |     {
122 |      "name": "stderr",
123 |      "output_type": "stream",
124 |      "text": [
125 |       "/home/kartushov/pet_projects/PostFinder/.venv/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:189: LangChainDeprecationWarning: The class `ChatOpenAI` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use langchain_openai.ChatOpenAI instead.\n",
126 |       "  warn_deprecated(\n"
127 |      ]
128 |     }
129 |    ],
130 |    "source": [
131 |     "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0, openai_api_key=PROXY_API_KEY, openai_api_base=\"https://api.proxyapi.ru/openai/v1\")"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 32,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": [
140 |     "QUERY_PROMPT = PromptTemplate(\n",
141 |     "    input_variables=[\"question\", \"context\"],\n",
142 |     "    template=\"\"\"Answer the question based on the context below. If the context doesn't contain the answer, say \"I don't know.\"\\n\\nContext: {context}\\n\\n---\\n\\nQuestion: {question}\\nAnswer:\"\"\",\n",
143 |     ")"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 27,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "# Retrieve and generate using the relevant snippets of the blog.\n",
153 |     "retriever = db.as_retriever()"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 28,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "question = \"Что такое симулятор мл?\""
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 29,
168 |    "metadata": {},
169 |    "outputs": [],
170 |    "source": [
171 |     "docs = retriever.get_relevant_documents(question)"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": 42,
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "context_text = \"\\n\\n---\\n\\n\".join([doc.page_content for doc in docs[::-1]])"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": 45,
186 |    "metadata": {},
187 |    "outputs": [],
188 |    "source": [
189 |     "prompt =QUERY_PROMPT.format(context=context_text, question=question)"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 49,
195 |    "metadata": {},
196 |    "outputs": [
197 |     {
198 |      "name": "stderr",
199 |      "output_type": "stream",
200 |      "text": [
201 |       "/home/kartushov/pet_projects/PostFinder/.venv/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:189: LangChainDeprecationWarning: The function `__call__` was deprecated in LangChain 0.1.7 and will be removed in 0.2.0. Use invoke instead.\n",
202 |       "  warn_deprecated(\n"
203 |      ]
204 |     },
205 |     {
206 |      "data": {
207 |       "text/plain": [
208 |        "'Симулятор МЛ - это программное обеспечение или игра, которая позволяет практиковать и оттачивать навыки в области машинного обучения.'"
209 |       ]
210 |      },
211 |      "execution_count": 49,
212 |      "metadata": {},
213 |      "output_type": "execute_result"
214 |     }
215 |    ],
216 |    "source": [
217 |     "llm.predict(prompt)"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "code",
222 |    "execution_count": null,
223 |    "metadata": {},
224 |    "outputs": [],
225 |    "source": []
226 |   }
227 |  ],
228 |  "metadata": {
229 |   "kernelspec": {
230 |    "display_name": "Python 3.10.5 ('.venv': poetry)",
231 |    "language": "python",
232 |    "name": "python3"
233 |   },
234 |   "language_info": {
235 |    "codemirror_mode": {
236 |     "name": "ipython",
237 |     "version": 3
238 |    },
239 |    "file_extension": ".py",
240 |    "mimetype": "text/x-python",
241 |    "name": "python",
242 |    "nbconvert_exporter": "python",
243 |    "pygments_lexer": "ipython3",
244 |    "version": "3.10.13"
245 |   },
246 |   "orig_nbformat": 4,
247 |   "vscode": {
248 |    "interpreter": {
249 |     "hash": "de87916773026c2a3e75fb54c4f93bfed36f59c832b56191b02f8dd5801604e3"
250 |    }
251 |   }
252 |  },
253 |  "nbformat": 4,
254 |  "nbformat_minor": 2
255 | }
256 | 


--------------------------------------------------------------------------------
/poetry.toml:
--------------------------------------------------------------------------------
1 | [virtualenvs]
2 | create = true
3 | in-project = true
4 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "PostFinder"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["Torchme"]
 6 | readme = "README.md"
 7 | 
 8 | [tool.poetry.dependencies]
 9 | python = "^3.10.1 <4.0"
10 | python-dotenv = "^1.0.0"
11 | aiohttp = "^3.9.1"
12 | openai = "^1.3.7"
13 | loguru = "^0.7.2"
14 | pandas = "2.0.3"
15 | telethon = "^1.33.1"
16 | aiogram = "^3.2.0"
17 | pyyaml = "^6.0.1"
18 | types-pyyaml = "^6.0.12.12"
19 | langchain = "^0.0.354"
20 | tiktoken = "^0.5.2"
21 | chromadb = "^0.4.0"
22 | asyncpg = "^0.29.0"
23 | sqlalchemy = "^2.0.25"
24 | alembic = "^1.13.1"
25 | asyncio = "^3.4.3"
26 | greenlet = "^3.0.3"
27 | pytest = "^7.4.4"
28 | pytest-asyncio = "^0.23.3"
29 | langchain-openai = "^0.0.2.post1"
30 | ruamel-yaml = "^0.18.6"
31 | 
32 | [tool.poetry.group.dev.dependencies]
33 | pytest = "^7.4.3"
34 | pre-commit = "^3.5.0"
35 | ruff = "^0.1.11"
36 | jupyterlab = "^4.0.10"
37 | 
38 | [tool.ruff]
39 | exclude = [
40 |     "**/__init__.py",
41 |     ".bzr",
42 |     ".direnv",
43 |     ".eggs",
44 |     ".git",
45 |     ".git-rewrite",
46 |     ".hg",
47 |     ".ipynb_checkpoints",
48 |     ".mypy_cache",
49 |     ".nox",
50 |     ".pants.d",
51 |     ".pyenv",
52 |     ".pytest_cache",
53 |     ".pytype",
54 |     ".ruff_cache",
55 |     ".svn",
56 |     ".tox",
57 |     ".venv",
58 |     ".vscode",
59 |     "__pypackages__",
60 |     "_build",
61 |     "buck-out",
62 |     "build",
63 |     "dist",
64 |     "node_modules",
65 |     "site-packages",
66 |     ".venv",
67 |     ".csv"
68 | ]
69 | 
70 | line-length = 88
71 | indent-width = 4
72 | 
73 | target-version = "py38"
74 | 
75 | [tool.ruff.lint]
76 | select = ["E4", "E7", "E9", "F"]
77 | ignore = []
78 | fixable = ["ALL"]
79 | unfixable = []
80 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
81 | 
82 | [tool.ruff.format]
83 | quote-style = "double"
84 | indent-style = "space"
85 | skip-magic-trailing-comma = false
86 | line-ending = "auto"
87 | 
88 | [build-system]
89 | requires = ["poetry-core"]
90 | build-backend = "poetry.core.masonry.api"
91 | 


--------------------------------------------------------------------------------
/run_bot.sh:
--------------------------------------------------------------------------------
1 | poetry run python -m src.app.bot
2 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torchme/PostFinder/14de2c2ea8e6b26e57ba03662ddeec7d8bf1991c/src/__init__.py


--------------------------------------------------------------------------------
/src/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torchme/PostFinder/14de2c2ea8e6b26e57ba03662ddeec7d8bf1991c/src/app/__init__.py


--------------------------------------------------------------------------------
/src/app/bot.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | from aiogram.types import BotCommand
 4 | from aiogram.methods import DeleteWebhook
 5 | from loguru import logger
 6 | 
 7 | from src.app.loader import bot, dp
 8 | from src.handlers.commands import router as router_commands
 9 | from src.handlers.callbacks import router as router_callbacks
10 | from src.handlers.dialog import router as router_dialog
11 | from src.handlers.admin import router as router_admin
12 | 
13 | 
class PostFinderBot:
    """Wires the shared dispatcher and drives the bot's polling lifecycle."""

    def __init__(self):
        """
        Hook the startup/shutdown callbacks into the dispatcher and attach the
        handler routers in the order: admin, commands, callbacks, dialog.
        """
        lifecycle_hooks = (
            (dp.startup, self.startup_event),
            (dp.shutdown, self.shutdown_event),
        )
        for event, callback in lifecycle_hooks:
            event.register(callback)

        routers = (
            router_admin,
            router_commands,
            router_callbacks,
            router_dialog,
        )
        for router in routers:
            dp.include_router(router)

    async def start(self):
        """
        Remove any configured webhook (dropping pending updates) and then
        start long polling on the dispatcher.
        """
        drop_webhook = DeleteWebhook(drop_pending_updates=True)
        await bot(drop_webhook)
        await dp.start_polling(bot)

    async def startup_event(self):
        """
        Startup callback: publishes the bot's command menu and logs that the
        commands were registered and the bot has started.
        """
        menu = (
            ("/help", "ℹ️ About me"),
            ("/find", "🔍 Find response. Params: channel (str), query (str)"),
            ("/account", "🛒 Plan"),
            ("/add_channel", "Request to add channel"),
        )
        await bot.set_my_commands(
            [BotCommand(command=name, description=text) for name, text in menu]
        )

        logger.warning("Registered commands")
        logger.warning("Bot started")

    async def shutdown_event(self):
        """
        Shutdown callback: only logs that the bot has stopped.
        """
        logger.warning("Bot stopped")
58 | 
59 | 
# Script entry point (e.g. `python -m src.app.bot`): constructing the bot wires
# the dispatcher, then asyncio.run drives polling until start() returns.
if __name__ == "__main__":
    bot_runner = PostFinderBot()
    asyncio.run(bot_runner.start())
63 | 


--------------------------------------------------------------------------------
/src/app/loader.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tiktoken
 3 | from aiogram import Bot, Dispatcher
 4 | from telethon import TelegramClient
 5 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 6 | from src.utils.extractor import Extractor   
 7 | from src.config import API_HASH, API_ID, TELEGRAM_BOT_TOKEN, PROXY_API_KEY
 8 | from src.database.postgres_service import PostgresManager
 9 | 
10 | 
# Shared singletons, created once at import time and imported by the rest of
# the application (e.g. src.app.bot imports `bot` and `dp`).

# aiogram bot; outgoing messages default to the "markdown" parse mode.
bot = Bot(token=TELEGRAM_BOT_TOKEN, parse_mode="markdown")

# aiogram dispatcher that routers and lifecycle hooks are registered on.
dp = Dispatcher()

# Telethon client; its session is stored at the given path, which is relative
# to the current working directory.
client = TelegramClient(
    "src/artifacts/sessions/post_finder.session", api_id=API_ID, api_hash=API_HASH
)

# Embedding function, sent through the proxyapi.ru OpenAI-compatible endpoint.
# NOTE(review): the key is read with os.getenv here while `llm` below uses the
# imported PROXY_API_KEY constant; both name the same environment variable.
emb_fn = OpenAIEmbeddings(
    api_key=os.getenv("PROXY_API_KEY"),
    model="text-embedding-ada-002",
    base_url="https://api.proxyapi.ru/openai/v1",
)

# Chat model, going through the same proxy endpoint.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-1106",
    temperature=0.7,
    api_key=PROXY_API_KEY,
    base_url="https://api.proxyapi.ru/openai/v1",
)

# Extractor built on top of the chat model (see src.utils.extractor).
extractor = Extractor(llm=llm)

# tiktoken encoding object for "cl100k_base".
encoding = tiktoken.get_encoding("cl100k_base")

# PostgreSQL access object shared by the application.
pg_manager = PostgresManager()
37 | 


--------------------------------------------------------------------------------
/src/app/openai.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | from dotenv import load_dotenv
 6 | from loguru import logger
 7 | from openai import OpenAI
 8 | 
 9 | 
# Load environment variables from ".env" (resolved against the current working
# directory) and read the API key used for the proxy endpoint.
dotenv_path = Path(".env")
load_dotenv(dotenv_path=dotenv_path)
PROXY_API_KEY = os.getenv("PROXY_API_KEY")

# OpenAI SDK client pointed at the proxyapi.ru OpenAI-compatible endpoint.
client = OpenAI(
    api_key=PROXY_API_KEY,
    base_url="https://api.proxyapi.ru/openai/v1",
)

# Extra loguru sink on stderr at INFO level.
# NOTE(review): a string `filter` restricts a loguru sink to records coming from
# that module name; nothing visible here is named "sub.module", so this sink may
# never emit anything — confirm the filter is intended.
logger.add(
    sys.stderr, format="{time} {level} {message}", filter="sub.module", level="INFO"
)
22 | 
23 | 
def chatgpt(prompt: str) -> str:
    """
    Send ``prompt`` to the chat-completions endpoint and return the reply text.

    The request is primed with a system message, one example user/assistant
    exchange, and a final user message asking for Markdown output followed by
    ``prompt``.

    Parameters
    ----------
    prompt : str
        Text appended to the Markdown instruction in the final user message.

    Returns
    -------
    str
        Content of the first choice returned by the API.

    Raises
    ------
    ValueError
        If the API returns no choices or the first choice has no content.
    """
    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        # TODO: Add multiple agents to generate questions
        messages=[
            {
                "role": "system",
                "content": "Ты умный AI чатбот в телеграм. Ты помогаешь студентам создавать красивые и понятные конспекты по видео лекциям.",
            },
            {"role": "user", "content": "Привет! Кто ты?"},
            {
                "role": "assistant",
                "content": """Привет! Я AI Student Assistant. Отправь мне видео и я создать конспект по теме видео что бы ты мог разобрать сложные моменты!\n
                """,
            },
            {
                "role": "user",
                # Separate the instruction from the user's text; plain
                # concatenation glued the prompt onto the last word.
                "content": f"Верни результат в MarkDown разметке\n\n{prompt}",
            },
        ],
    )
    # An empty choices list is reported the same way as missing content,
    # instead of surfacing as a bare IndexError.
    choices = chat_completion.choices
    message_content = choices[0].message.content if choices else None
    if message_content is None:
        raise ValueError("Ответ от API пустой или некорректный")
    return message_content
46 | 


--------------------------------------------------------------------------------
/src/artifacts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torchme/PostFinder/14de2c2ea8e6b26e57ba03662ddeec7d8bf1991c/src/artifacts/__init__.py


--------------------------------------------------------------------------------
/src/artifacts/migrations/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
2 | 


--------------------------------------------------------------------------------
/src/artifacts/migrations/env.py:
--------------------------------------------------------------------------------
 1 | from logging.config import fileConfig
 2 | import os
 3 | import sys
 4 | 
 5 | from sqlalchemy import engine_from_config
 6 | from sqlalchemy import pool
 7 | 
 8 | sys.path.append(os.path.join(sys.path[0], "src"))
 9 | 
10 | from alembic import context  # noqa
11 | 
12 | from src.config import DB_HOST, DB_NAME, DB_PASS, DB_PORT, DB_USER  # noqa
13 | from src.database import Base  # noqa
14 | from src.database.models import *  # noqa
15 | 
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Copy the database settings from src.config into the main ini section.
# NOTE(review): presumably alembic.ini builds sqlalchemy.url from these options
# via %(DB_HOST)s-style interpolation — confirm against alembic.ini.
section = config.config_ini_section
config.set_section_option(section, "DB_HOST", DB_HOST)
config.set_section_option(section, "DB_PORT", DB_PORT)
config.set_section_option(section, "DB_USER", DB_USER)
config.set_section_option(section, "DB_NAME", DB_NAME)
config.set_section_option(section, "DB_PASS", DB_PASS)

# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Metadata used for 'autogenerate' support: the declarative Base from
# src.database; the wildcard import of src.database.models above ensures the
# model classes are imported before this is read.
target_metadata = Base.metadata
37 | 
38 | # other values from the config, defined by the needs of env.py,
39 | # can be acquired:
40 | # my_important_option = config.get_main_option("my_important_option")
41 | # ... etc.
42 | 
43 | 
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    The context is configured from the ``sqlalchemy.url`` option only, so no
    Engine is created and no DBAPI has to be installed. Parameters are
    rendered as literals, and calls to context.execute() emit the given
    string to the script output.

    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
66 | 
67 | 
def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    Builds an Engine from the ``sqlalchemy.``-prefixed options of the main ini
    section (no connection pooling), opens a connection and runs the
    migrations inside a transaction on it.

    """
    ini_options = config.get_section(config.config_ini_section, {})
    engine = engine_from_config(
        ini_options,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()
86 | 
87 | 
# Executed when Alembic loads this env.py: pick the runner that matches the
# mode reported by the migration context.
if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
92 | 


--------------------------------------------------------------------------------
/src/artifacts/migrations/script.py.mako:
--------------------------------------------------------------------------------
 1 | """${message}
 2 | 
 3 | Revision ID: ${up_revision}
 4 | Revises: ${down_revision | comma,n}
 5 | Create Date: ${create_date}
 6 | 
 7 | """
 8 | from typing import Sequence, Union
 9 | 
10 | from alembic import op
11 | import sqlalchemy as sa
12 | ${imports if imports else ""}
13 | 
14 | # revision identifiers, used by Alembic.
15 | revision: str = ${repr(up_revision)}
16 | down_revision: Union[str, None] = ${repr(down_revision)}
17 | branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
18 | depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
19 | 
20 | 
21 | def upgrade() -> None:
22 |     ${upgrades if upgrades else "pass"}
23 | 
24 | 
25 | def downgrade() -> None:
26 |     ${downgrades if downgrades else "pass"}
27 | 


--------------------------------------------------------------------------------
/src/config/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import *
2 | 


--------------------------------------------------------------------------------
/src/config/config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from pathlib import Path
 3 | 
 4 | from dotenv import load_dotenv
 5 | 
 6 | from src.config.managment import Config
 7 | 
# ENV_STATE selects which dotenv file is loaded; anything other than "docker"
# (including unset, which defaults to "development") falls back to ".env".
env_state = os.getenv("ENV_STATE", "development")

if env_state == "docker":
    dotenv_path = Path(".env-docker")
else:
    dotenv_path = Path(".env")

# Paths are relative, so the file is looked up from the current working directory.
load_dotenv(dotenv_path=dotenv_path)

# Settings read from the environment. All values are strings, or None when the
# variable is not set (TELEGRAM_BOT_TOKEN falls back to an empty string instead).
API_ID = os.getenv("API_ID")
API_HASH = os.getenv("API_HASH")
PHONE = os.getenv("PHONE")
CONTACT_ACCOUNT = os.getenv("CONTACT_ACCOUNT")

TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
PROXY_API_KEY = os.getenv("PROXY_API_KEY")

# Database data
DB_USER = os.getenv("DB_USER")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_PASS = os.getenv("DB_PASS")

ADMIN_CHAT_ID = os.getenv("ADMIN_CHAT_ID")

# YAML file with message texts and ID lists; relative to the working directory.
CONFIG_PATH = "src/config/config.yaml"

# Shared Config instance; constructing it reads the YAML file at import time.
config = Config(filename=CONFIG_PATH)
37 | 


--------------------------------------------------------------------------------
/src/config/config.yaml:
--------------------------------------------------------------------------------
  1 | callback:
  2 |   approve:
  3 |     user: 
  4 |       to_admins: |
  5 |         ✅ Одобрено:
  6 |         username: {username}
  7 |         user_id: {user_id}
  8 |         Если захотите удалить пользователя, используйте /del_user [user_id] 
  9 |       to_user: |
 10 |         ✅ Спасибо за ожидание! Вам выдан доступ к боту
 11 |     channel: 
 12 |       to_user: |
 13 |         ✅ Спасибо за ожидание! Канал {} был одобрен модерацией!
 14 |       to_admins: |
 15 |         ✅Канал {} был добавлен в пул каналов.
 16 |   deny:
 17 |       user:
 18 |         to_admins: |
          ❌ Отказано:
          username: {username}
          user_id: {user_id}
 20 |         to_user: |
 21 |           ❌ К сожалению, Вам отказано в доступе к боту
 22 |       channel: 
 23 |         to_user: |
 24 |           ❌Канал {channel} был  отклонен модерацией
 25 |         to_admins: |
          ❌ Отказано:
          channel: {channel}
 27 | messages:
 28 |   welcome: |
 29 |     Привет! Я — *PostFinder*, инструмент, который изменит твое взаимодействие с социальными сетями.
 30 | 
 31 |     Если вы постоянно теряетесь в ТОННЕ КОНТЕНТА,
 32 |     приходиться листать БЕСКОНЕЧНОЕ количество постов и никак не можете найти ТОТ САМЫЙ пост? То вы пришли в нужное место!
 33 | 
 34 |     *Команды:*
 35 |     - /help - показывает эту справку
 36 |     - /start - начало диалога
 37 |     - /find `<@channel> <Запрос>` - ищет ответ на Ваш вопрос в указанном канале
 38 | 
 39 |     _Например: /find @postfinder Как найти нужный пост в группе?_
 40 |   errors:
 41 |     parse_error: |
      Аргументы не были переданы.
      Пожалуйста, уточните канал и Ваш запрос, после команды

      _Например: /find @postfinder Как найти нужный пост в группе?_
 43 |     parse_channel_error: |
      Аргументы не были переданы.
 45 |     no_rights: |
      Error: Access denied! You don't have rights for this.
 47 |     unknown_message_error: |
 48 |       🚫 Я не распознал ваш запрос.
 49 | 
 50 |       Чтобы продолжить диалог, пожалуйста, ответьте на одно из предыдущих сообщений или воспользуйтесь командой:
 51 |       */find [Канал] [Запрос]*
 52 | 
 53 |   searching: |
 54 |     👀 Ищем ответы...
 55 |   
 56 |   moderation: 
 57 |     channel:
 58 |       processing: |
 59 |         Предложенный Вами канал в статусе рассмотрения администрацией, пожалуйста, ожидайте.
 60 |       deny: |
 61 |         
 62 |     user: 
 63 |       processing: |
 64 |         Вы еще не прошли модерацию, пожалуйста, ожидайте
 65 |       answer: |
 66 |         Ваш аккаунт в статусе рассмотрения модерацией, пожалуйста, ожидайте    
 67 |   action_to_continue: |
 68 |     🔹 Чтобы продолжить, ответьте на это сообщение    
 69 |   action_processed: |
 70 |     Action for user {} processed!
 71 |   unknown: |
 72 |     Упс... Похоже я не знаю такой команды 😬
 73 |   admin:
 74 |     users:
 75 |       remove:
 76 |         success: |
 77 |           User {} was removed
 78 |         fail: |
 79 |           User ID not found in the whitelist.
 80 |       add:
 81 |         success: |
 82 |           User {} was successfully added!
 83 |         fail: |
 84 |           User {} is already in whitelist!
 85 |     channel:
 86 |       remove:
 87 |         success: |
 88 |           Channel {} was removed
 89 |         fail: |
 90 |           Channel {} isn't in pool
 91 |       add:
 92 |         success: |
 93 |           Channel {} was successfully added!
 94 |         fail: | 
 95 |           Channel {} is already in pool!
 96 |   user: 
 97 |     registered: |
 98 |       User {} registered!
 99 |     already_registered: |
100 |       User {} is already registered!
101 | 
102 | 
103 | templates:
104 |   extract: |
105 |     Write the tags for the query below:
106 |     ----------------------------------------
107 |     Query: "{query}"
108 |     ----------------------------------------
109 |     Instruction:
110 |     1) You must use language as in original query
111 |     2) Do not duplicate the query in your response.
112 |     3) Do not use words from original query in your response, your tags must be new and unique
113 |     4) Write the tags in the order of relevance, separated by commas, without any other characters.
114 | 
115 |     Examples:
116 |     Query: 'Какой рецепт борща?'
117 |     Answer: 'Кулинария, суп'
118 | 
119 |     Query: 'Кто создал ChatGPT?'
120 |     Answer: 'AI, OpenAI, LLM'
121 | 
122 |     Query: 'Что такое МЛ симулятор?'
123 |     Answer: 'Машинное обучение, искусственный интеллект'
124 |   prompt: |
125 |    Answer the question based on the context below. Use language as in question. "\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:
126 | users:
127 |   - 11
128 | 
129 |   - 326213970
130 | admins:
131 |   - 2
132 |   - 326213970
133 | 


--------------------------------------------------------------------------------
/src/config/managment.py:
--------------------------------------------------------------------------------
  1 | from ruamel.yaml import YAML
  2 | from loguru import logger
  3 | 
  4 | 
  5 | class Config:
  6 |     def __init__(self, filename: str) -> None:
  7 |         self.filename = filename
  8 |         self.yaml = YAML()
  9 |         self.yaml.preserve_quotes = True
 10 |         self.yaml.indent(mapping=2, sequence=4, offset=2)
 11 |         self.admin_ids = []
 12 |         self.whitelist = []
 13 |         self.load_ids()
 14 |         logger.info(f"Config initialized with {self.filename}")
 15 | 
 16 |     def load_ids(self):
 17 |         """Load user and admin IDs from the configuration file into attributes."""
 18 |         config = self._load_config()
 19 |         self.admin_ids = config.get("admins", [])
 20 |         self.whitelist = config.get("users", [])
 21 |         logger.info("Admin and user IDs loaded from config.")
 22 | 
 23 |     def add_id(self, id_type, user_id):
 24 |         """
 25 |         Add a unique user or admin ID to the respective list in the configuration and update the attribute.
 26 | 
 27 |         Parameters
 28 |         ----------
 29 |         id_type : str
 30 |             The type of ID to add ('admins' or 'users').
 31 |         user_id : int
 32 |             The ID of the user or admin to add.
 33 | 
 34 |         Returns
 35 |         -------
 36 |         bool
 37 |             True if the ID was added, False if the ID was already present.
 38 |         """
 39 |         config = self._load_config()
 40 | 
 41 |         if config.get(id_type) is None:
 42 |             config[id_type] = []
 43 | 
 44 |         if user_id not in config[id_type]:
 45 |             config[id_type].append(user_id)
 46 |             self._save_config(config)
 47 | 
 48 |             if id_type == "users":
 49 |                 self.whitelist.append(user_id)
 50 |             elif id_type == "admins":
 51 |                 self.admin_ids.append(user_id)
 52 | 
 53 |             logger.info(f"ID {user_id} added to the {id_type} list and saved.")
 54 |             return True
 55 |         else:
 56 |             logger.warning(f"ID {user_id} is already in the {id_type} list.")
 57 |             return False
 58 | 
 59 |     def remove_id(self, id_type, user_id):
 60 |         """
 61 |         Remove a user or admin ID from the respective list in the configuration and update the attribute.
 62 | 
 63 |         Parameters
 64 |         ----------
 65 |         id_type : str
 66 |             The type of ID to remove ('admins' or 'users').
 67 |         user_id : int
 68 |             The ID of the user or admin to remove.
 69 | 
 70 |         Returns
 71 |         -------
 72 |         bool
 73 |             True if the ID was removed, False if the ID was not found.
 74 |         """
 75 |         config = self._load_config()
 76 | 
 77 |         if config.get(id_type) and user_id in config[id_type]:
 78 |             config[id_type].remove(user_id)
 79 |             self._save_config(config)
 80 | 
 81 |             if id_type == "users":
 82 |                 self.whitelist.remove(user_id)
 83 |             elif id_type == "admins":
 84 |                 self.admin_ids.remove(user_id)
 85 | 
 86 |             logger.info(f"ID {user_id} removed from the {id_type} list and saved.")
 87 |             return True
 88 |         else:
 89 |             logger.error(f"ID {user_id} not found in the {id_type} list.")
 90 |             return False
 91 | 
 92 |     def get(self, keys: list, if_na=None)  -> str:
 93 |         """
 94 |         Get value from config.
 95 |         Parameters
 96 |         ----------
 97 |         keys: list
 98 |             List of strings(keys for yaml)  ('admins' or 'users').
 99 |         Returns
100 |         -------
101 |         int or str
102 |             Value from yaml config.
103 |         """
104 |         config = self._load_config()
105 |         value = config
106 |         pointer = 0
107 |         try:
108 |             while pointer!=len(keys):
109 |                 value = value[keys[pointer]]
110 |                 pointer += 1
111 |             return value
112 |         
113 |         except Exception:
114 |             logger.error(f"Failed to get value by keys {keys}")
115 |             return if_na
116 |     
117 |     def _load_config(self) -> dict:
118 |         try:
119 |             with open(self.filename, "r", encoding="utf-8") as file:
120 |                 return self.yaml.load(file) or {}
121 |         except FileNotFoundError as e:
122 |             logger.error(f"Configuration file not found: {e}")
123 |             return {}
124 | 
125 |     def _save_config(self, config: dict) -> None:
126 |         try:
127 |             with open(self.filename, "w", encoding="utf-8") as file:
128 |                 self.yaml.dump(config, file)
129 |             logger.info(f"Configuration saved to {self.filename}")
130 |         except Exception as e:
131 |             logger.error(f"Failed to save configuration: {e}")
132 | 
133 | 


--------------------------------------------------------------------------------
/src/database/__init__.py:
--------------------------------------------------------------------------------
1 | from .database import *
2 | 


--------------------------------------------------------------------------------
/src/database/backup.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import datetime
 3 | from loguru import logger
 4 | 
 5 | from src.config import DB_HOST, DB_NAME, DB_PORT, DB_USER
 6 | 
 7 | 
 8 | def backup_database(
 9 |     backup_path: str = "src/artifacts",
10 |     host: str = DB_HOST,
11 |     port: str = DB_PORT,
12 |     user: str = DB_USER,
13 |     dbname: str = DB_NAME,
14 | ):
15 |     """Create PostgreSQL database backup file"""
16 |     date_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
17 |     filename = f"{backup_path}/backup_{dbname}_{date_str}.dump"
18 | 
19 |     command = f"pg_dump -h {host} -p {port} -U {user} -d {dbname} -F c > {filename}"
20 | 
21 |     try:
22 |         subprocess.run(command, check=True, shell=True)
23 |         logger.debug(f"Backup successful: {filename}")
24 |     except subprocess.CalledProcessError as e:
25 |         logger.error(f"Error during backup: {e}")
26 | 
27 | 
def restore_database(
    backup_filepath: str,
    host: str = DB_HOST,
    port: str = DB_PORT,
    user: str = DB_USER,
    dbname: str = DB_NAME,
):
    """
    Restore a PostgreSQL database from a backup file with ``pg_restore``.

    Failures are logged rather than raised.

    Parameters
    ----------
    backup_filepath : str
        Path of the dump file to restore from.
    host, port, user, dbname : str
        Connection parameters passed to ``pg_restore``.
    """
    # Argument list instead of a shell string; the dump is passed as the
    # input-file argument rather than through shell redirection.
    command = [
        "pg_restore",
        "-h", str(host),
        "-p", str(port),
        "-U", str(user),
        "-d", str(dbname),
        backup_filepath,
    ]

    try:
        # check=True so a failing pg_restore reaches the except branch; with
        # check=False success was logged regardless of the exit status.
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        logger.error(f"Error during database restore: {e}")
    else:
        logger.debug(f"Database restored successfully from {backup_filepath}")
45 | 


--------------------------------------------------------------------------------
/src/database/chroma_service.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | from langchain_community.vectorstores.chroma import Chroma
  3 | from langchain_community.document_loaders import DataFrameLoader
  4 | from langchain_core.documents import Document
  5 | import pandas as pd
  6 | 
  7 | from src.app.loader import emb_fn
  8 | from src.utils.scrapper import scrape_telegram_messages
  9 | from src.app.loader import client
 10 | 
 11 | 
 12 | class ChromaManager:
 13 |     def __init__(
 14 |         self, channel: str, persist_directory: str = "./chroma_db", emb_fn=emb_fn
 15 |     ):
 16 |         """
 17 |         Initialize the ChromaDB object.
 18 | 
 19 |         Parameters
 20 |         ----------
 21 |         channel : str
 22 |             The name of the channel.
 23 |         persist_directory : str, optional
 24 |             The directory to persist the ChromaDB data. Defaults to "./chroma_db".
 25 |         emb_fn
 26 |             The embedding function.
 27 |         """
 28 |         self.persist_directory = persist_directory
 29 |         self.emb_fn = emb_fn
 30 |         self.channel = channel
 31 |         self.collection = Chroma(
 32 |             persist_directory=self.persist_directory,
 33 |             embedding_function=self.emb_fn,
 34 |             collection_name=self.channel,
 35 |         )
 36 | 
 37 |     async def create_collection(self, docs: List[Document]) -> None:
 38 |         """
 39 |         Asynchronously creates a collection with the given list of documents.
 40 | 
 41 |         Parameters
 42 |         ----------
 43 |         docs : List[Document]
 44 |             The list of documents to be added to the collection.
 45 | 
 46 |         Returns
 47 |         -------
 48 |         None
 49 |         """
 50 |         await self.collection.afrom_documents(
 51 |             documents=docs,
 52 |             embedding=self.emb_fn,
 53 |             persist_directory=self.persist_directory,
 54 |             collection_name=self.channel,
 55 |         )
 56 | 
 57 |     def last_msg_id(self):
 58 |         """
 59 |         Returns the last message ID from the metadatas collection.
 60 |         """
 61 |         metadatas = self.collection.get()["metadatas"]
 62 |         last_metadata = max(metadatas, key=lambda x: x["message_id"])
 63 |         last_message_id = last_metadata["message_id"]
 64 | 
 65 |         return last_message_id
 66 | 
 67 |     def dataframe_to_documents(self, data: pd.DataFrame) -> List[Document]:
 68 |         """
 69 |         Convert a pandas DataFrame to a list of Document objects.
 70 | 
 71 |         Parameters
 72 |         ----------
 73 |         data : pd.DataFrame
 74 |             The input pandas DataFrame.
 75 | 
 76 |         Returns
 77 |         -------
 78 |         List[Document]
 79 |             A list of Document objects.
 80 |         """
 81 |         loader = DataFrameLoader(data, page_content_column="text")
 82 |         return loader.load()
 83 | 
 84 |     def collection_exists(self) -> bool:
 85 |         """
 86 |         Check if the collection exists and return a boolean value.
 87 |         """
 88 |         return bool(self.collection.get()["ids"])
 89 | 
 90 |     async def update_collection(self):
 91 |         """
 92 |         An async function that updates the collection. It checks if the collection exists, and if so, it retrieves the last message ID and scrapes new data from the Telegram messages. If new data is found, it creates a dataframe, converts it to documents, and adds the documents to the collection. If the collection does not exist, it scrapes initial data from the Telegram messages, creates a dataframe, converts it to documents, and creates a new collection. After the update, it sets the collection with a new Chroma instance.
 93 |         """
 94 |         if self.collection_exists():
 95 |             last_message_id = self.last_msg_id()
 96 |             new_data = await scrape_telegram_messages(
 97 |                 client=client, channel=self.channel, min_id=last_message_id
 98 |             )
 99 |             if new_data:
100 |                 df = pd.DataFrame(new_data)
101 |                 docs = self.dataframe_to_documents(df)
102 |                 await self.collection.aadd_documents(docs)
103 |         else:
104 |             initial_data = await scrape_telegram_messages(
105 |                 client=client, channel=self.channel
106 |             )
107 |             if initial_data:
108 |                 df = pd.DataFrame(initial_data)
109 |                 docs = self.dataframe_to_documents(df)
110 |                 await self.create_collection(docs)
111 | 
112 |         self.collection = Chroma(
113 |             persist_directory=self.persist_directory,
114 |             embedding_function=self.emb_fn,
115 |             collection_name=self.channel,
116 |         )
117 | 


--------------------------------------------------------------------------------
/src/database/database.py:
--------------------------------------------------------------------------------
 1 | from typing import AsyncGenerator
 2 | 
 3 | from sqlalchemy import MetaData
 4 | from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
 5 | from sqlalchemy.orm import declarative_base, sessionmaker
 6 | 
 7 | from src.config import DB_HOST, DB_NAME, DB_PASS, DB_PORT, DB_USER
 8 | 
# Connection URL for the asyncpg driver, assembled from the environment settings.
# NOTE(review): the parts are interpolated unescaped; a password containing
# characters such as "@" or "/" would likely yield an invalid URL — confirm.
DATABASE_URL = f"postgresql+asyncpg://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
# Declarative base class that the ORM models inherit from.
Base = declarative_base()
# Standalone MetaData object, separate from Base.metadata.
metadata = MetaData()

# Async engine and session factory shared by the application; sessions keep
# their loaded attributes after commit (expire_on_commit=False).
engine = create_async_engine(DATABASE_URL)
async_session_maker = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
15 | 
16 | 
async def get_async_session() -> AsyncGenerator[AsyncSession, None]:
    """Yield one session from ``async_session_maker``, scoped by ``async with``."""
    session = async_session_maker()
    async with session:
        yield session
20 | 


--------------------------------------------------------------------------------
/src/database/models.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from sqlalchemy import TIMESTAMP, Column, ForeignKey, Integer, String
 4 | 
 5 | from src.database import Base
 6 | 
 7 | 
class User(Base):
    """ORM model for the ``user`` table."""

    __tablename__ = "user"

    # Surrogate primary key.
    user_id = Column(Integer, primary_key=True)
    # Telegram account id; unique, and the target of the foreign keys declared
    # on UserSubscription and Action.
    # NOTE(review): Integer maps to a 32-bit column on PostgreSQL, while Telegram
    # documents user ids as needing up to 52 bits — confirm whether BigInteger
    # (plus a migration) is required.
    telegram_id = Column(Integer, nullable=False, unique=True)
    # Optional profile fields.
    username = Column(String, nullable=True)
    first_name = Column(String, nullable=True)
    last_name = Column(String, nullable=True)
    bio = Column(String, nullable=True)
    # Defaults to the insertion time as a naive UTC timestamp.
    registered_at = Column(TIMESTAMP, default=datetime.utcnow)
18 | 
19 | 
class SubscriptionType(Base):
    """ORM model for the ``subscription_type`` table."""

    __tablename__ = "subscription_type"

    # Surrogate primary key, referenced by UserSubscription.type_id.
    type_id = Column(Integer, primary_key=True)
    # Unique, required name of the subscription type.
    type_name = Column(String, unique=True, nullable=False)
    # Required integer price; the currency/unit is not visible here.
    monthly_price = Column(Integer, nullable=False)
26 | 
27 | 
class UserSubscription(Base):
    """ORM model for the ``user_subscription`` table, linking a user to a subscription type."""

    __tablename__ = "user_subscription"

    # Surrogate primary key.
    subscription_id = Column(Integer, primary_key=True)
    # References user.telegram_id (the unique Telegram id, not the user_id key).
    telegram_id = Column(Integer, ForeignKey("user.telegram_id"))
    # References subscription_type.type_id.
    type_id = Column(Integer, ForeignKey("subscription_type.type_id"))
    # Required start and end timestamps of the subscription.
    valid_from = Column(TIMESTAMP, nullable=False)
    valid_to = Column(TIMESTAMP, nullable=False)
36 | 
37 | 
class Action(Base):
    """One processed request together with the generated answer (table ``action``)."""

    __tablename__ = "action"

    # Surrogate primary key.
    action_id = Column(Integer, primary_key=True)
    # References user.telegram_id. Telegram user ids exceed the signed 32-bit
    # range, so the column is 64-bit (existing databases need a migration).
    telegram_id = Column(BigInteger, ForeignKey("user.telegram_id"))
    # Id of the bot's answer message; used to look the action up later.
    response_id = Column(Integer, nullable=True)
    platform_type = Column(String, nullable=True)
    resource_name = Column(String, nullable=True)
    # Raw user question and the full prompt that was sent to the model.
    query = Column(String, nullable=True)
    prompt = Column(String, nullable=True)
    # Model answer and its usage statistics.
    response = Column(String, nullable=True)
    input_tokens = Column(Integer, nullable=True)
    output_tokens = Column(Integer, nullable=True)
    execution_time = Column(Integer, nullable=True)
    # User feedback attached to the answer afterwards.
    feedback = Column(String, nullable=True)
53 | 
54 | 
class Channel(Base):
    """Channel accepted into the pool (table ``channel``)."""

    __tablename__ = "channel"

    # The channel identifier string is the primary key.
    channel = Column(String, primary_key=True)
    # Telegram id of the requesting user. Telegram user ids exceed the signed
    # 32-bit range, so the column is 64-bit (existing databases need a migration).
    requested_by_id = Column(BigInteger, nullable=True)
    username = Column(String, nullable=True)
    # Filled on insert with a naive UTC timestamp.
    added_date = Column(TIMESTAMP, default=datetime.utcnow)
    followers = Column(Integer, nullable=True)
62 | 


--------------------------------------------------------------------------------
/src/database/postgres_service.py:
--------------------------------------------------------------------------------
  1 | from sqlalchemy import insert, select, update, delete
  2 | from src.database import async_session_maker
  3 | from src.database.models import User, Action, Channel
  4 | 
  5 | 
  6 | class PostgresManager:
  7 |     async def add_user(
  8 |         self, telegram_id: int, username: str, first_name: str, last_name: str, bio: str
  9 |     ) -> None:
 10 |         """
 11 |         Add a new user to the database with the provided Telegram ID, username, first name, last name, and bio.
 12 | 
 13 |         Parameters
 14 |         ----------
 15 |         telegram_id : int
 16 |             The Telegram ID of the user.
 17 |         username : str
 18 |             The username of the user.
 19 |         first_name : str
 20 |             The first name of the user.
 21 |         last_name : str
 22 |             The last name of the user.
 23 |         bio : str
 24 |             The bio of the user
 25 |         """
 26 |         async with async_session_maker() as session:
 27 |             stm = insert(User).values(
 28 |                 telegram_id=telegram_id,
 29 |                 username=username,
 30 |                 first_name=first_name,
 31 |                 last_name=last_name,
 32 |                 bio=bio,
 33 |             )
 34 | 
 35 |             await session.execute(stm)
 36 |             await session.commit()
 37 | 
 38 |     async def del_user(self, telegram_id: int) -> None:
 39 |         async with async_session_maker() as session:
 40 |             stm = delete(User).where(User.telegram_id == telegram_id)
 41 |             await session.execute(stm)
 42 |             await session.commit()
 43 | 
 44 |     async def user_exists(self, telegram_id: int) -> bool:
 45 |         """
 46 |         Check if a user with the given telegram_id exists in the database.
 47 | 
 48 |         Parameters
 49 |         ----------
 50 |         self : instance
 51 |             The instance of the class.
 52 |         telegram_id : int
 53 |             The telegram_id of the user to check.
 54 | 
 55 |         Returns
 56 |         -------
 57 |         bool
 58 |             True if the user exists, False otherwise.
 59 |         """
 60 |         async with async_session_maker() as session:
 61 |             query = select(User).where(User.telegram_id == telegram_id)
 62 | 
 63 |             result = await session.execute(query)
 64 |             exists = result.mappings().fetchall()
 65 | 
 66 |             return bool(exists)
 67 | 
 68 |     async def add_action(
 69 |         self,
 70 |         telegram_id: int,
 71 |         response_id: int,
 72 |         platform_type: str,
 73 |         resource_name: str,
 74 |         query: str,
 75 |         prompt: str,
 76 |         response: str,
 77 |         input_tokens: int,
 78 |         output_tokens: int,
 79 |         execution_time: int,
 80 |     ):
 81 |         """
 82 |         Add a new action to the database with the provided Telegram ID, response ID, platform type, resource name, query, prompt, response, input tokens, output tokens, and execution time.
 83 | 
 84 |         Parameters
 85 |         ----------
 86 |         telegram_id : int
 87 |             The Telegram ID of the user.
 88 |         response_id : int
 89 |             The response ID of the action.
 90 |         platform_type : str
 91 |             The platform type of the action.
 92 |         resource_name : str
 93 |             The name of the resource.
 94 |         query : str
 95 |             The query of the action.
 96 |         prompt : str
 97 |             The prompt of the action.
 98 |         response : str
 99 |             The response of the action.
100 |         input_tokens : int
101 |             The number of input tokens of the action.
102 |         output_tokens : int
103 |             The number of output tokens of the action.
104 |         execution_time : int
105 |             The execution time of the action.
106 |         """
107 |         async with async_session_maker() as session:
108 |             stm = insert(Action).values(
109 |                 telegram_id=telegram_id,
110 |                 response_id=response_id,
111 |                 platform_type=platform_type,
112 |                 resource_name=resource_name,
113 |                 query=query,
114 |                 prompt=prompt,
115 |                 response=response,
116 |                 input_tokens=input_tokens,
117 |                 output_tokens=output_tokens,
118 |                 execution_time=execution_time,
119 |             )
120 | 
121 |             await session.execute(stm)
122 |             await session.commit()
123 | 
124 |     async def add_feedback(self, response_id: int, feedback: str):
125 |         """
126 |         Add a new feedback to the database with the provided response ID and feedback.
127 | 
128 |         Parameters
129 |         ----------
130 |         response_id : int
131 |             The response ID of the action.
132 |         feedback : str
133 |             The feedback of the user.
134 |         """
135 |         async with async_session_maker() as session:
136 |             stm = (
137 |                 update(Action)
138 |                 .where(Action.response_id == response_id)
139 |                 .values(feedback=feedback)
140 |             )
141 | 
142 |             await session.execute(stm)
143 |             await session.commit()
144 | 
145 |     async def get_previous_context(self, reply_to_message_id: int):
146 |         """
147 |         Get the previous context from the database with the provided reply_to_message_id.
148 | 
149 |         Parameters
150 |         ----------
151 |         reply_to_message_id : int
152 |             The reply_to_message_id of the action.
153 | 
154 |         Returns
155 |         -------
156 |         previus_context
157 |         """
158 |         async with async_session_maker() as session:
159 |             query = select(Action.prompt, Action.response, Action.resource_name).where(
160 |                 Action.response_id == reply_to_message_id
161 |             )
162 | 
163 |             result = await session.execute(query)
164 |             previus_context = result.mappings().fetchone()
165 | 
166 |             return previus_context
167 | 
168 |     async def add_channel(
169 |         self, channel: str, user_id: int, members_count: int, username: str
170 |     ) -> None:
171 |         """
172 |         Add channel to pool.
173 | 
174 |         Parameters
175 |         ----------
176 |         channel:
177 |             Channel to add.
178 |         """
179 |         async with async_session_maker() as session:
180 |             stm = insert(Channel).values(
181 |                 channel=channel,
182 |                 requested_by_id=user_id,
183 |                 username=username,
184 |                 followers=members_count,
185 |             )
186 | 
187 |             await session.execute(stm)
188 |             await session.commit()
189 |             return True
190 | 
191 |     async def del_channel(self, channel: str):
192 |         async with async_session_maker() as session:
193 |             stm = delete(Channel).where(Channel.channel == channel)
194 |             await session.execute(stm)
195 |             await session.commit()
196 | 
197 |         return True
198 | 
199 |     async def channel_exists(self, channel: str) -> bool:
200 |         """
201 |         Check if a user with the given telegram_id exists in the database.
202 | 
203 |         Parameters
204 |         ----------
205 |         self : instance
206 |             The instance of the class.
207 |         channel : str
208 |             Channel to check.
209 | 
210 |         Returns
211 |         -------
212 |         bool
213 |             True if the user exists, False otherwise.
214 |         """
215 |         async with async_session_maker() as session:
216 |             query = select(Channel.channel).where(Channel.channel == channel)
217 | 
218 |             result = await session.execute(query)
219 |             exists = result.mappings().fetchall()
220 | 
221 |             return bool(exists)
222 | 
223 |     async def get_pool(self) -> tuple[str, str, str]:
224 |         """
225 |         Return the pool of channels.
226 | 
227 |         Parameters
228 |         ----------
229 |         self : instance
230 |             The instance of the class.
231 | 
232 |         Returns
233 |         -------
234 |         tuple[str, str, int]
235 |             (channel:str, username:str, followers:int)
236 |         """
237 |         async with async_session_maker() as session:
238 |             query = select(
239 |                 Channel.channel, Channel.username, Channel.followers
240 |             ).order_by()
241 | 
242 |             result = await session.execute(query)
243 |             result = result.all()
244 |             return result
245 | 


--------------------------------------------------------------------------------
/src/handlers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torchme/PostFinder/14de2c2ea8e6b26e57ba03662ddeec7d8bf1991c/src/handlers/__init__.py


--------------------------------------------------------------------------------
/src/handlers/admin.py:
--------------------------------------------------------------------------------
  1 | from aiogram import Router, types
  2 | from aiogram.filters import Command, CommandObject
  3 | from src.app.loader import pg_manager, bot
  4 | from src.utils.validation import validate_id, validate_add_channel_command_args
  5 | from src.config import config
  6 | 
  7 | router = Router()
  8 | 
  9 | 
 10 | @router.message(Command(commands="add_user"))
 11 | async def add_user(message: types.Message, command: CommandObject):
 12 |     user_id, error_msg = validate_id(message, command, config.admin_ids)
 13 |     telegram_id = message.from_user.id
 14 |     username = message.from_user.username or ""
 15 |     first_name = message.from_user.first_name or ""
 16 |     last_name = message.from_user.last_name or ""
 17 |     user_info = await bot.get_chat(telegram_id)
 18 |     bio = user_info.bio or ""
 19 |     if error_msg:
 20 |         await message.answer(error_msg)
 21 |         return
 22 | 
 23 |     if await pg_manager.user_exists(telegram_id=user_id):
 24 |         await message.answer(
 25 |             config.get(["messages", "admin", "users", "add", "fail"]).format(user_id)
 26 |         )
 27 |         return
 28 |     await pg_manager.add_user(
 29 |         telegram_id=telegram_id,
 30 |         username=username,
 31 |         first_name=first_name,
 32 |         last_name=last_name,
 33 |         bio=bio,
 34 |     )
 35 |     await message.answer(
 36 |         config.get(["messages", "admin", "users", "add", "success"]).format(user_id)
 37 |     )
 38 | 
 39 | 
 40 | @router.message(Command(commands="del_user"))
 41 | async def del_user(message: types.Message, command: CommandObject):
 42 |     user_id, error_msg = validate_id(message, command, config.admin_ids)
 43 | 
 44 |     if error_msg:
 45 |         await message.answer(error_msg)
 46 |         return
 47 | 
 48 |     if user_id not in config.admin_ids:
 49 |         await message.answer(config.get(["messages", "errors", "no_rights"]))
 50 |         return
 51 | 
 52 |     if not await pg_manager.user_exists(telegram_id=message.from_user.id):
 53 |         await message.answer(
 54 |             config.get(["messages", "admin", "users", "remove", "fail"]).format(user_id)
 55 |         )
 56 |         return
 57 |     await pg_manager.del_user(telegram_id=message.from_user.id)
 58 |     await message.answer(
 59 |         config.get(["messages", "admin", "users", "remove", "success"]).format(user_id)
 60 |     )
 61 | 
 62 | 
 63 | @router.message(Command(commands="del_channel"))
 64 | async def del_channel(message: types.Message, command: CommandObject):
 65 |     args = command.args
 66 |     channel, error_msg = await validate_add_channel_command_args(args)
 67 |     user_id = message.from_user.id
 68 |     if error_msg:
 69 |         await message.answer(error_msg)
 70 |         return
 71 | 
 72 |     if user_id not in config.admin_ids:
 73 |         await message.answer(config.get(["messages", "errors", "no_rights"]))
 74 |         return
 75 | 
 76 |     if not await pg_manager.channel_exists(channel=channel):
 77 |         await message.answer(
 78 |             config.get(["messages", "admin", "channel", "remove", "fail"]).format(
 79 |                 channel
 80 |             )
 81 |         )
 82 |         return
 83 |     elif await pg_manager.del_channel(channel):
 84 |         await message.answer(
 85 |             config.get(["messages", "admin", "channel", "remove", "success"]).format(
 86 |                 channel
 87 |             )
 88 |         )
 89 | 
 90 | 
 91 | @router.message(Command(commands="pool"))
 92 | async def show_pool(message: types.Message, command: CommandObject):
 93 |     text = ""
 94 |     result = await pg_manager.get_pool()
 95 |     if result:
 96 |         for i, (channel, username, members) in enumerate(result):
 97 |             text += f"{i+1}. {channel} [{members} подписчиков] добавил @{username}\n"
 98 |     else:
 99 |         text = "Пул каналов пуст"
100 |     await message.answer(text)
101 | 


--------------------------------------------------------------------------------
/src/handlers/callbacks.py:
--------------------------------------------------------------------------------
 1 | from aiogram import F, Router, types
 2 | 
 3 | from src.app.loader import pg_manager, bot
 4 | from src.config import ADMIN_CHAT_ID, config
 5 | from src.utils.schemas import FeedbackCallback, AdminUserCallback, AdminChannelCallback
 6 | 
 7 | router = Router()
 8 | 
 9 | 
@router.callback_query(FeedbackCallback.filter(F.type == "user_evaluation"))
async def get_feedback(
    callback_query: types.CallbackQuery, callback_data: FeedbackCallback
):
    """
    Store the feedback a user gave on an answer and acknowledge the button press.

    Parameters
    ----------
    callback_query : types.CallbackQuery
        The callback query object.
    callback_data : FeedbackCallback
        The callback data carrying ``message_id`` and ``feedback``.
    """
    response_id = int(callback_data.message_id)
    feedback = callback_data.feedback

    await pg_manager.add_feedback(response_id=response_id, feedback=feedback)
    # Echo the chosen feedback back as the callback answer.
    await callback_query.answer(callback_data.feedback)
28 | 
29 | 
@router.callback_query(AdminUserCallback.filter(F.type == "admin_user"))
async def admin_action_user(
    callback_query: types.CallbackQuery, callback_data: AdminUserCallback
):
    """
    Apply an admin decision on a user's access request.

    On approval the user id is added to the configured ``users`` ids and a
    database row is created if one does not exist yet. In both cases the admin
    chat and the user are notified with the decision-specific templates.

    Parameters
    ----------
    callback_query : types.CallbackQuery
        The callback query object.
    callback_data : AdminUserCallback
        The callback data carrying ``user_id``, ``username`` and ``action``.
    """
    user_id = int(callback_data.user_id)
    username = callback_data.username
    decision = "approve" if callback_data.action == "approve" else "deny"

    if decision == "approve":
        config.add_id(id_type="users", user_id=user_id)
        # A row may already exist for this user and telegram_id is unique,
        # so only insert when it is missing.
        if not await pg_manager.user_exists(telegram_id=user_id):
            await pg_manager.add_user(
                telegram_id=user_id,
                # Store the bare handle in case the callback carries an "@" prefix.
                username=(username or "").lstrip("@"),
                first_name="",
                last_name="",
                bio="",
            )

    await bot.send_message(
        ADMIN_CHAT_ID,
        config.get(["callback", decision, "user", "to_admins"]).format(
            username=username, user_id=user_id
        ),
        parse_mode=None,
    )
    # The user always gets the user-facing template, never the admin one.
    await bot.send_message(
        user_id, config.get(["callback", decision, "user", "to_user"])
    )
    # Acknowledge the button press so the client stops showing a spinner.
    await callback_query.answer()
59 | 
60 | 
61 | 
@router.callback_query(AdminChannelCallback.filter(F.type == "admin_channel"))
async def admin_action_channel(
    callback_query: types.CallbackQuery, callback_data: AdminChannelCallback
):
    """
    Apply an admin decision on a request to add a channel to the pool.

    On approval the channel is stored together with its current member count
    unless it is already in the pool. In both cases the admin chat and the
    requesting user are notified with the decision-specific templates.

    Parameters
    ----------
    callback_query : types.CallbackQuery
        The callback query object.
    callback_data : AdminChannelCallback
        The callback data carrying ``user_id``, ``channel``, ``username`` and
        ``action``.
    """
    user_id = int(callback_data.user_id)
    channel = callback_data.channel
    username = callback_data.username
    decision = "approve" if callback_data.action == "approve" else "deny"

    # The chat lookup is only needed for the member count, so it is skipped on
    # deny. The existence check keeps a repeated approve from violating the
    # channel primary key.
    if decision == "approve" and not await pg_manager.channel_exists(channel=channel):
        chat_info = await bot.get_chat(channel)
        await pg_manager.add_channel(
            channel=channel,
            user_id=user_id,
            username=username,
            members_count=await chat_info.get_member_count(),
        )

    await bot.send_message(
        ADMIN_CHAT_ID,
        config.get(["callback", decision, "channel", "to_admins"]).format(channel),
        parse_mode=None,
    )
    await bot.send_message(
        user_id,
        config.get(["callback", decision, "channel", "to_user"]).format(channel),
    )
    # Acknowledge the button press so the client stops showing a spinner.
    await callback_query.answer()


--------------------------------------------------------------------------------
/src/handlers/commands.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | from aiogram import Router, types
  4 | from aiogram.enums.chat_action import ChatAction
  5 | from aiogram.filters import Command, CommandObject
  6 | from langchain.prompts import PromptTemplate
  7 | from loguru import logger
  8 | 
  9 | from src.app.loader import llm, pg_manager, bot, encoding, extractor
 10 | from src.database.chroma_service import ChromaManager
 11 | from src.config import config
 12 | from src.utils.validation import validate_parse_command_args, validate_add_channel_command_args
 13 | from src.utils.filters import UnknownCommandFilter
 14 | from src.utils.markup import inline_markup_feedback
 15 | from src.utils.admin_service import send_user_to_admins, send_channel_to_admins
 16 | from src.config import ADMIN_CHAT_ID
 17 | router = Router()
 18 | 
 19 | 
 20 | @router.message(Command(commands=["start", "help"]))
 21 | async def send_welcome(message: types.Message):
 22 |     """
 23 |     Sends a welcome message to the user and registers the user in the system if not already registered.
 24 |     Takes a message object as input.
 25 |     """
 26 |     welcome_message = config.get(['messages', 'welcome'])
 27 | 
 28 | 
 29 |     await message.answer(welcome_message)
 30 | 
 31 |     telegram_id = message.from_user.id
 32 |     username = message.from_user.username or ""
 33 |     first_name = message.from_user.first_name or ""
 34 |     last_name = message.from_user.last_name or ""
 35 | 
 36 |     user_info = await bot.get_chat(telegram_id)
 37 |     bio = user_info.bio or ""
 38 | 
 39 |     if not await pg_manager.user_exists(telegram_id=telegram_id):
 40 |         await pg_manager.add_user(
 41 |             telegram_id=telegram_id,
 42 |             username=username,
 43 |             first_name=first_name,
 44 |             last_name=last_name,
 45 |             bio=bio,
 46 |         )
 47 |         logger.info(config.get(['messages', 'user', 'registered']).format(telegram_id))
 48 |     else:
 49 |         logger.info(config.get(['messages', 'user', 'already_registered']).format(telegram_id))
 50 | 
 51 |     if telegram_id not in config.whitelist:
 52 |         await send_user_to_admins(
 53 |             user_id=telegram_id,
 54 |             username="@" + username,
 55 |             first_name=first_name,
 56 |             last_name=last_name,
 57 |         )
 58 |         await message.answer(
 59 |             config.get(['messages', 'moderation', 'answer'])
 60 |         )
 61 | 
 62 | 
 63 | @router.message(Command(commands="find"))
 64 | async def find_answer(message: types.Message, command: CommandObject):
 65 |     """
 66 |     Asynchronous function for finding an answer based on the given message and command object.
 67 | 
 68 |     Parameters
 69 |     ----------
 70 |         message: aiogram.types.Message
 71 |             The message object.
 72 |         command: aiogram.filters.CommandObject
 73 |             The command object containing arguments.
 74 |     """
 75 |     if message.from_user.id not in config.whitelist:
 76 |         await message.answer(config.get(['messages', 'moderation','processing']))
 77 |         return
 78 | 
 79 |     args = command.args
 80 | 
 81 |     channel, query, _, error_message = validate_parse_command_args(args)
 82 | 
 83 |     if error_message:
 84 |         await message.answer(error_message)
 85 |         return
 86 | 
 87 |     start_time = time.time()
 88 | 
 89 |     msg = await message.answer(config.get(["messages", "searching"]))
 90 | 
 91 |     chroma_manager = ChromaManager(channel=channel)
 92 | 
 93 |     await bot.send_chat_action(message.chat.id, ChatAction.TYPING)
 94 | 
 95 |     await chroma_manager.update_collection()
 96 | 
 97 |     retriever = chroma_manager.collection.as_retriever()
 98 |     docs = retriever.get_relevant_documents(
 99 |         extractor.add_features(query=query), search_kwargs={"k": 5}
100 |     )
101 | 
102 |     context_text = "\n\n---\n\n".join(
103 |         [f"Text №{i}" + doc.page_content for i, doc in enumerate(docs)]
104 |     )
105 |     cut_length = [
106 |         7 if len(doc.page_content.split()) > 7 else len(doc.page_content.split())
107 |         for doc in docs
108 |     ]
109 |     relevant_post_urls = [
110 |         f"[{' '.join(doc.page_content.split()[:(cut_length[i])])}...](t.me/{channel}/{doc.metadata['message_id']})"
111 |         for i, doc in enumerate(docs)
112 |     ][:5]
113 | 
114 |     QUERY_TEAMPLATE = PromptTemplate(
115 |         input_variables=["question", "context"],
116 |         template=config.get(["templates", "prompt"]),
117 |     )
118 | 
119 |     query_prompt = QUERY_TEAMPLATE.format(context=context_text, question=query)
120 |     msg_text = "🙋🏼‍♂️ *Ваш вопрос:*\n" + query + "\n\n🔍 *Найденный ответ:*\n"
121 |     await msg.edit_text(msg_text)
122 |     response = ""
123 | 
124 |     async for stream_response in llm.astream(query_prompt):
125 |         if len(stream_response.content) != 0:
126 |             response += stream_response.content
127 |             msg_text += stream_response.content
128 |         if (len(msg_text.split()) % 7 == 0) and len(msg_text.split()) >= 7:
129 |             await msg.edit_text(msg_text)
130 | 
131 |     msg_text += "\n\n• " + "\n• ".join(relevant_post_urls)
132 |     msg_text += f"""\n\n{config.get(['messages', 'action_to_continue'])}"""
133 |     await msg.edit_text(
134 |         msg_text,
135 |         reply_markup=inline_markup_feedback(message_id=msg.message_id),
136 |         disable_web_page_preview=True,
137 |     )
138 | 
139 |     input_tokens = len(encoding.encode(query_prompt))
140 |     output_tokens = len(encoding.encode(response))
141 |     end_time = time.time()
142 |     execution_time = int(end_time - start_time)
143 | 
144 |     await pg_manager.add_action(
145 |         telegram_id=message.from_user.id,
146 |         response_id=msg.message_id,
147 |         platform_type="telegram",
148 |         resource_name=channel,
149 |         prompt=query_prompt,
150 |         query=query,
151 |         response=response,
152 |         input_tokens=input_tokens,
153 |         output_tokens=output_tokens,
154 |         execution_time=execution_time,
155 |     )
156 | 
157 |     logger.info(
158 |         config.get(["messages", "action_processed"]).format(message.from_user.id)
159 |     )
160 | 
161 | 
162 | 
@router.message(Command(commands=["add_channel"]))
async def add_channel(message: types.Message, command: CommandObject):
    """
    Handle ``/add_channel``: add a channel to the pool or send it to moderation.

    When the command is sent in the admin chat the channel is added right
    away; otherwise the request is forwarded to the admins for approval.

    Parameters
    ----------
    message : types.Message
        The incoming command message.
    command : CommandObject
        The parsed command whose ``args`` name the channel.
    """
    channel, error_message = await validate_add_channel_command_args(command.args)
    if error_message:
        await message.answer(error_message)
        return

    chat_info = await bot.get_chat(channel)
    sender = message.from_user
    user_id = sender.id
    chat_id = message.chat.id
    username = sender.username

    if message.from_user.id not in config.whitelist:
        await message.answer(
            config.get(['messages', 'moderation', 'channel', 'processing'])
        )
        return

    if await pg_manager.channel_exists(channel=channel):
        await message.answer(
            config.get(['messages', 'admin', 'channel', 'add', 'fail']).format(channel)
        )
        return

    if chat_id == int(ADMIN_CHAT_ID):
        # Sent from the admin chat: add immediately.
        await pg_manager.add_channel(
            channel=channel,
            user_id=message.from_user.id,
            members_count=await chat_info.get_member_count(),
            username=username,
        )
        await message.answer(
            config.get(['callback', 'approve', 'channel', 'to_admins']).format(channel)
        )
        return

    # Regular whitelisted user: notify them and ask the admins.
    await bot.send_message(
        chat_id=user_id,
        text=config.get(['messages', 'moderation', 'channel', 'processing']),
    )
    await send_channel_to_admins(user_id=user_id, channel=channel, username=username)
194 | 
@router.message(UnknownCommandFilter())
async def unknown_command(message: types.Message):
    """
    Reply with the configured "unknown" text to messages matched by
    ``UnknownCommandFilter``.

    Parameters
    ----------
    message : types.Message
        The incoming message.
    """
    reply_text = config.get(["messages", "unknown"])
    await message.answer(reply_text)
199 | 


--------------------------------------------------------------------------------
/src/handlers/dialog.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | from aiogram import Router, types
 3 | from langchain.schema import HumanMessage, AIMessage
 4 | from loguru import logger
 5 | 
 6 | from src.app.loader import pg_manager, llm, encoding
 7 | from src.utils.filters import MessageReplyFilter
 8 | from src.utils.markup import inline_markup_feedback
 9 | from src.config import config
10 | 
11 | router = Router()
12 | 
13 | 
@router.message(MessageReplyFilter())
async def dialog(message: types.Message):
    """
    Continue a conversation when the user replies to one of the bot's answers.

    The stored prompt and response of the replied-to answer are sent to the
    LLM together with the new question. The answer is streamed into a single
    Telegram message, and the finished action is stored in the database.
    Replies without text, and replies to messages that have no stored action,
    are ignored.

    Parameters
    ----------
    message : types.Message
        The reply message sent by the user.
    """
    if message.from_user.id not in config.whitelist:
        # Same key as the other handlers use for the "no rights" text.
        await message.answer(config.get(["messages", "errors", "no_rights"]))
        return

    query = message.text
    if not query:
        # Replies without text (photos, stickers, ...) carry no question.
        return

    start_time = time.time()

    replied_id = message.reply_to_message.message_id
    previous_context = await pg_manager.get_previous_context(
        reply_to_message_id=replied_id
    )
    if previous_context is None:
        # The filter matches every reply, including replies to messages that
        # were never stored as an action.
        logger.warning("No stored action for replied message {}", replied_id)
        return

    previous_prompt = previous_context["prompt"]
    previous_response = previous_context["response"]
    resource_name = previous_context["resource_name"]

    messages = [
        HumanMessage(content=previous_prompt),
        AIMessage(content=previous_response),
        HumanMessage(content=f"Question: {query}\nAnswer:"),
    ]
    prompt = "\n".join([item.content for item in messages])
    logger.debug("Dialog prompt:\n{}", prompt)

    msg_text = "🙋🏼‍♂️ *Ваш вопрос:*\n" + query + "\n\n🔍 *Найденный ответ:*\n"
    response = ""

    msg = await message.answer(msg_text)

    # Word count of the text Telegram currently shows. Re-sending unchanged
    # text is rejected by Telegram, so an edit is only made when the count
    # has moved on to a new multiple of seven.
    shown_words = len(msg_text.split())
    async for stream_response in llm.astream(messages):
        if not stream_response.content:
            continue
        response += stream_response.content
        msg_text += stream_response.content

        word_count = len(msg_text.split())
        if word_count >= 7 and word_count % 7 == 0 and word_count != shown_words:
            await msg.edit_text(msg_text)
            shown_words = word_count

    # Separate the footer from the answer the same way the /find handler does.
    msg_text += "\n\n" + config.get(['messages', 'action_to_continue'])
    await msg.edit_text(
        msg_text,
        reply_markup=inline_markup_feedback(message_id=msg.message_id),
        disable_web_page_preview=True,
    )

    input_tokens = len(encoding.encode(prompt))
    output_tokens = len(encoding.encode(response))
    execution_time = int(time.time() - start_time)

    await pg_manager.add_action(
        telegram_id=message.from_user.id,
        response_id=msg.message_id,
        platform_type="telegram",
        resource_name=resource_name,
        prompt=prompt,
        query=query,
        response=response,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        execution_time=execution_time,
    )

    logger.info(config.get(['messages', 'action_processed']).format(message.from_user.id))
85 | 


--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torchme/PostFinder/14de2c2ea8e6b26e57ba03662ddeec7d8bf1991c/src/utils/__init__.py


--------------------------------------------------------------------------------
/src/utils/admin_service.py:
--------------------------------------------------------------------------------
 1 | from src.app.loader import bot
 2 | from src.config import config, ADMIN_CHAT_ID
 3 | from src.utils.markup import inline_markup_admin_user, inline_markup_admin_channel 
 4 | 
 5 | 
 6 | async def send_user_to_admins(user_id: int, username: str, first_name: str, last_name: str):
 7 |     if user_id not in config.whitelist:
 8 |         await bot.send_message(
 9 |             ADMIN_CHAT_ID,
10 |             f"user_id: {user_id}\nusername: {username}\nfirst_name: {first_name}\nlast_name: {last_name}\n\nРазрешить доступ?",
11 |             reply_markup=inline_markup_admin_user(user_id=user_id, username=username),
12 |             parse_mode=None,
13 |         )
14 |     
async def send_channel_to_admins(user_id: int, channel: str, username:str):
    """
    Post a request to add a channel to the pool to the admin chat.

    Parameters
    ----------
    user_id : int
        Telegram id of the requesting user.
    channel : str
        Channel the user asked to add.
    username : str
        Username of the requesting user.
    """
    request_text = "Channel: {channel}\nДобавить в пул каналов?".format(channel=channel)
    keyboard = inline_markup_admin_channel(
        channel=channel, user_id=user_id, username=username
    )
    await bot.send_message(
        ADMIN_CHAT_ID,
        request_text,
        reply_markup=keyboard,
        parse_mode=None,
    )


--------------------------------------------------------------------------------
/src/utils/antifrod.py:
--------------------------------------------------------------------------------
1 | from src.app.loader import bot
2 | 
async def validate_channel(channel: str, min_members: int = 1000) -> bool:
    """Check that a channel has enough members to be accepted.

    Args:
        channel: Channel identifier passed to ``bot.get_chat``.
        min_members: Minimum member count required for the channel to pass.
            Defaults to 1000, the threshold that was previously hard-coded.

    Returns:
        ``True`` when the channel has at least ``min_members`` members,
        ``False`` otherwise.

    Any exception raised by ``bot.get_chat`` or the member-count request is
    not caught here and propagates to the caller.
    """
    chat_info = await bot.get_chat(channel)
    members_count = await chat_info.get_member_count()
    return members_count >= min_members


--------------------------------------------------------------------------------
/src/utils/extractor.py:
--------------------------------------------------------------------------------
 1 | from langchain.prompts import PromptTemplate
 2 | from src.config import config
 3 | 
 4 | 
 5 | class Extractor:
 6 |     def __init__(self, llm):
 7 |         self.llm = llm
 8 |         extract_template = config.get(['templates', 'extract'])
 9 |         self.template = PromptTemplate(
10 |             input_variables=["query"], template=extract_template
11 |         )
12 | 
13 |     def add_features(self, query):
14 |         features = self.llm.predict(self.template.format(query=query))
15 |         query += f"\nТеги: {features}"
16 | 
17 |         return query
18 | 


--------------------------------------------------------------------------------
/src/utils/filters.py:
--------------------------------------------------------------------------------
 1 | from aiogram.filters import Filter
 2 | from aiogram.types import Message
 3 | 
 4 | 
 5 | class UnknownCommandFilter(Filter):
 6 |     def __init__(self) -> None:
 7 |         self.commands = ["/start", "/help", "/find"]
 8 | 
 9 |     async def __call__(self, message: Message) -> bool:
10 |         try:
11 |             return message.text.startswith("/") and message.text not in self.commands
12 |         except Exception:
13 |             return False
14 | 
15 | 
class MessageReplyFilter(Filter):
    """Matches messages that are replies to another message."""

    async def __call__(self, message: Message) -> bool:
        """Return ``True`` when ``message.reply_to_message`` is truthy."""
        return bool(message.reply_to_message)
21 | 


--------------------------------------------------------------------------------
/src/utils/markup.py:
--------------------------------------------------------------------------------
 1 | from aiogram.types import InlineKeyboardButton, InlineKeyboardMarkup
 2 | 
 3 | from src.utils.schemas import AdminUserCallback, AdminChannelCallback, FeedbackCallback
 4 | 
 5 | 
 6 | def inline_markup_feedback(message_id: int) -> InlineKeyboardMarkup:
 7 |     buttons = [
 8 |         InlineKeyboardButton(
 9 |             text="👎",
10 |             callback_data=FeedbackCallback(
11 |                 type="user_evaluation", message_id=str(message_id), feedback="dislike"
12 |             ).pack(),
13 |         ),
14 |     ]
15 | 
16 |     return InlineKeyboardMarkup(inline_keyboard=[buttons], one_time_keyboard=True)
17 | 
18 | 
def inline_markup_admin_user(user_id: int, username: str) -> InlineKeyboardMarkup:
    """Build the approve/dismiss inline keyboard for a user access request.

    Both buttons sit in one row, approve first. Each carries a packed
    ``AdminUserCallback`` of type ``admin_user`` with the stringified
    ``user_id``, the ``username`` and the chosen action.
    """
    choices = (("✅", "approve"), ("❌", "dismiss"))
    row = [
        InlineKeyboardButton(
            text=label,
            callback_data=AdminUserCallback(
                type="admin_user",
                user_id=str(user_id),
                username=username,
                action=action,
            ).pack(),
        )
        for label, action in choices
    ]

    return InlineKeyboardMarkup(inline_keyboard=[row], one_time_keyboard=True)
36 | 
37 | 
def inline_markup_admin_channel(user_id: int, channel: str, username: str) -> InlineKeyboardMarkup:
    """Build the approve/dismiss inline keyboard for a channel request.

    Both buttons sit in one row, approve first. Each carries a packed
    ``AdminChannelCallback`` of type ``admin_channel`` with the stringified
    ``channel`` and ``user_id``, the ``username`` and the chosen action.
    """
    choices = (("✅", "approve"), ("❌", "dismiss"))
    row = [
        InlineKeyboardButton(
            text=label,
            callback_data=AdminChannelCallback(
                type="admin_channel",
                channel=str(channel),
                user_id=str(user_id),
                action=action,
                username=username,
            ).pack(),
        )
        for label, action in choices
    ]

    return InlineKeyboardMarkup(inline_keyboard=[row], one_time_keyboard=True)
55 | 


--------------------------------------------------------------------------------
/src/utils/schemas.py:
--------------------------------------------------------------------------------
 1 | from aiogram.filters.callback_data import CallbackData
 2 | 
 3 | 
class FeedbackCallback(CallbackData, prefix="feedback"):
    """Callback payload attached to the feedback button under a bot answer.

    Built in ``inline_markup_feedback`` and packed into the button's
    ``callback_data`` under the ``feedback`` prefix.
    """

    # Kind of feedback event; the markup helper sets "user_evaluation".
    type: str
    # Id of the rated message, passed as a string.
    message_id: str
    # The evaluation itself; the markup helper sets "dislike".
    feedback: str
 8 | 
 9 | 
class AdminUserCallback(CallbackData, prefix="user"):
    """Callback payload for the admin approve/dismiss buttons of a user request.

    Built in ``inline_markup_admin_user`` and packed under the ``user`` prefix.
    """

    # Kind of admin event; the markup helper sets "admin_user".
    type: str
    # Id of the user requesting access, passed as a string.
    user_id: str
    # Username of the requesting user.
    username: str
    # Admin decision; the markup helper sets "approve" or "dismiss".
    action: str
15 | 
class AdminChannelCallback(CallbackData, prefix="channel"):
    """Callback payload for the admin approve/dismiss buttons of a channel request.

    Built in ``inline_markup_admin_channel`` and packed under the ``channel``
    prefix.
    """

    # Kind of admin event; the markup helper sets "admin_channel".
    type: str
    # Id of the user who proposed the channel, passed as a string.
    user_id: str
    # Channel proposed for the channel pool.
    channel: str
    # Username of the user who proposed the channel.
    username: str
    # Admin decision; the markup helper sets "approve" or "dismiss".
    action: str
22 | 


--------------------------------------------------------------------------------
/src/utils/scrapper.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | import pandas as pd
 3 | from loguru import logger
 4 | from telethon import TelegramClient
 5 | 
 6 | 
 7 | async def scrape_telegram_messages(
 8 |     client: TelegramClient, channel: str, min_id: int = 0, limit: int = 10_000
 9 | ) -> Optional[pd.DataFrame]:
10 |     """Scraping messages from telegram-channel"""
11 |     await client.start()
12 | 
13 |     logger.info("Client for scrapping Created")
14 |     logger.info("Scrapping...")
15 | 
16 |     result = []
17 |     async for message in client.iter_messages(channel, limit=limit, min_id=min_id):
18 |         try:
19 |             message_info = {
20 |                 "message_id": message.id,
21 |                 "date": str(message.date),
22 |                 "text": message.text if message.text else "",
23 |             }
24 |             result.append(message_info)
25 |         except Exception:
26 |             logger.exception(f"Failed to parse message with id {message.id}")
27 |             continue
28 | 
29 |     if result:
30 |         return result
31 |     return None
32 | 


--------------------------------------------------------------------------------
/src/utils/validation.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional, Tuple
 2 | from aiogram import types
 3 | from aiogram.filters import CommandObject
 4 | from src.config import config
 5 | from src.app.loader import bot
 6 | 
 7 | 
 8 | def validate_parse_command_args(args_str: Optional[str]):
 9 |     if not args_str:
10 |         return (
11 |             None,
12 |             None,
13 |             None,
14 |             config.get(["messages", "parse_error"]),
15 |         )
16 |     args = args_str.split()
17 |     channel = args[0].replace("@", "")
18 |     context = " ".join(args[1:])
19 |     limit = 100
20 | 
21 |     return channel, context, limit, ""
22 | 
23 | 
async def validate_add_channel_command_args(
    args_str: Optional[str],
) -> Tuple[Optional[str], str]:
    """Parse and verify the argument of the add-channel command.

    The first whitespace-separated token is taken as the channel and is
    checked by requesting it through ``bot.get_chat``.

    Args:
        args_str: Raw argument string of the command, or ``None``.

    Returns:
        ``(channel, error_msg)``. On success ``error_msg`` is an empty
        string. When no argument is given (``None``, empty or
        whitespace-only input) the result is ``None`` plus the
        ``messages.errors.parse_channel_error`` config text. When
        ``bot.get_chat`` raises, the result is ``None`` plus
        ``"Channel not found"``.
    """
    # Splitting first also catches whitespace-only input, which would
    # otherwise produce an empty list and fail on args[0].
    args = args_str.split() if args_str else []
    if not args:
        return (
            None,
            config.get(["messages", "errors", "parse_channel_error"]),
        )
    channel = args[0]

    try:
        await bot.get_chat(channel)
    except Exception:
        # Any failure to fetch the chat is reported as a missing channel.
        return None, "Channel not found"

    return channel, ""
39 | 
40 | 
def validate_id(
    message: types.Message, command: CommandObject, admin_ids: List[int]
) -> Tuple[Optional[int], str]:
    """Check admin rights and parse a user id from the command arguments.

    Args:
        message: Incoming message; its sender id must be in ``admin_ids``.
        command: Parsed command whose ``args`` should hold the user id.
        admin_ids: Ids of users allowed to run the command.

    Returns:
        ``(user_id, error_msg)``. On success ``error_msg`` is an empty
        string. When the sender is missing or not an admin, or when the
        argument is missing or not an integer, ``user_id`` is ``None`` and
        ``error_msg`` describes the problem.
    """
    sender = message.from_user
    if sender is None or sender.id not in admin_ids:
        return None, "Access denied! You don't have rights for this"

    # command.args is None when the command was sent without arguments;
    # treating it as an empty string lets int() reject it as a ValueError
    # instead of failing with AttributeError on None.strip().
    raw_id = (command.args or "").strip()
    try:
        return int(raw_id), ""
    except ValueError:
        return None, "Error: User id must be an integer!"
56 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/torchme/PostFinder/14de2c2ea8e6b26e57ba03662ddeec7d8bf1991c/tests/__init__.py


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import pytest
 3 | 
 4 | 
 5 | @pytest.fixture(scope="session", autouse=True)
 6 | def event_loop():
 7 |     loop = asyncio.get_event_loop_policy().new_event_loop()
 8 |     yield loop
 9 |     loop.close()
10 | 


--------------------------------------------------------------------------------
/tests/test_db.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | 
@pytest.mark.asyncio
class TestChromaManager:
    """Placeholder test class marked for asyncio; it defines no tests yet."""

    # NOTE(review): presumably meant to hold tests for the Chroma manager — confirm.
    pass
7 | 


--------------------------------------------------------------------------------