├── .dockerignore
├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── publish.yml
│       ├── push-to-registry.yml
│       ├── ruff.yml
│       └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── Dockerfile
├── LICENSE
├── README.md
├── docs
│   ├── dashboard.md
│   ├── distributed_processing.md
│   ├── doc_images
│   │   ├── backend_image 1.png
│   │   ├── backend_image 2.png
│   │   ├── backend_image 3.png
│   │   ├── backend_image 4.png
│   │   ├── backend_image 5.png
│   │   ├── backend_image 6.png
│   │   ├── backend_image 7.png
│   │   ├── backend_image 8.png
│   │   ├── backend_image.png
│   │   └── index_api.png
│   ├── evaluation.md
│   ├── index.md
│   ├── index_api.md
│   ├── installation.md
│   ├── process.md
│   ├── rag.md
│   ├── rcp_and_production.md
│   ├── retriever_api_specs.yaml
│   ├── uv.md
│   └── websearch.md
├── examples
│   ├── cc
│   │   └── process_config.yaml
│   ├── index
│   │   ├── bge
│   │   │   └── config.yaml
│   │   ├── config.yaml
│   │   ├── openai
│   │   │   ├── config.yaml
│   │   │   └── ner.db
│   │   └── qwen
│   │       ├── config.yaml
│   │       └── ner.db
│   ├── postprocessor
│   │   └── config.yaml
│   ├── process
│   │   └── config.yaml
│   ├── pubmedqa
│   │   ├── index_bge.yaml
│   │   ├── index_stella.yaml
│   │   └── pp.yaml
│   ├── rag
│   │   ├── config.yaml
│   │   ├── config_api.yaml
│   │   ├── evaluation
│   │   │   ├── indexer_eval_example_config.yaml
│   │   │   ├── rag_eval_example_config.yaml
│   │   │   ├── rag_evaluated_example_config.yaml
│   │   │   └── rag_evaluator_example.py
│   │   ├── openai
│   │   │   ├── output.jsonl
│   │   │   └── rag_openai.yaml
│   │   ├── output.json
│   │   ├── queries.jsonl
│   │   ├── swissai
│   │   │   ├── output.jsonl
│   │   │   └── rag_swissai.yaml
│   │   └── vllm
│   │       └── rag_vllm.yaml
│   ├── retriever_api
│   │   └── config.yaml
│   ├── sample_data
│   │   ├── docx
│   │   │   └── ums.docx
│   │   ├── eml
│   │   │   └── sample.eml
│   │   ├── example_outputs_ner
│   │   │   ├── merged
│   │   │   │   └── merged_results.jsonl
│   │   │   └── pp
│   │   │       ├── 1___🌍 Language ID.jsonl
│   │   │       ├── 2___🍷 FineWeb Quality.jsonl
│   │   │       ├── 3___🧩 Chunker.jsonl
│   │   │       ├── 4___🗣️ Lang Detector.jsonl
│   │   │       └── final_pp.jsonl
│   │   ├── html
│   │   │   ├── ex1.htm
│   │   │   └── gorilla.html
│   │   ├── images
│   │   │   └── image.png
│   │   ├── md
│   │   │   └── test.md
│   │   ├── media
│   │   │   ├── sample_audio.mp3
│   │   │   └── video.mp4
│   │   ├── pdf
│   │   │   ├── Corona Virus Disease-19 pandemic.pdf
│   │   │   ├── calendar.pdf
│   │   │   └── llava-interleave.pdf
│   │   ├── pptx
│   │   │   └── ada.pptx
│   │   ├── spreadsheet
│   │   │   └── survey.xlsx
│   │   └── txt
│   │       ├── poeme.txt
│   │       └── test.txt
│   ├── websearchRAG
│   │   ├── config.yaml
│   │   ├── config_api.yaml
│   │   └── enhanced_results_trial.json
│   └── who
│       ├── index
│       │   ├── index_config_bge.yaml
│       │   ├── index_config_e5.yaml
│       │   ├── index_config_gte.yaml
│       │   ├── index_config_medembed.yaml
│       │   └── index_config_stella.yaml
│       ├── pp
│       │   ├── chunk_semantic.yaml
│       │   ├── chunk_sentence.yaml
│       │   └── pp_config.yaml
│       └── scott_ds
│           ├── k1
│           │   ├── scott_ds_gpt4o_bge.jsonl
│           │   ├── scott_ds_meditron8b_bge.jsonl
│           │   ├── scott_gpt4o_bge.yaml
│           │   ├── scott_meditron70b_bge.yaml
│           │   └── scott_meditron8b_bge.yaml
│           ├── k3
│           │   ├── scott_ds_gpt4o_bge.jsonl
│           │   ├── scott_ds_meditron8b_bge.jsonl
│           │   ├── scott_gpt4o_bge.yaml
│           │   ├── scott_meditron70b_bge.yaml
│           │   └── scott_meditron8b_bge.yaml
│           ├── norag
│           │   ├── scott_ds_gpt4o_norag.jsonl
│           │   ├── scott_ds_gpt4o_norag_100tokens.jsonl
│           │   ├── scott_ds_meditron70b_norag.jsonl
│           │   ├── scott_ds_meditron70b_norag_100tokens.jsonl
│           │   ├── scott_ds_meditron8b_norag.jsonl
│           │   ├── scott_ds_meditron8b_norag_100tokens.jsonl
│           │   ├── scott_gpt4o_norag.yaml
│           │   ├── scott_meditron70b_norag.yaml
│           │   └── scott_meditron8b_norag.yaml
│           ├── scott_ds_meditron70b_bge.jsonl
│           ├── scott_ds_meditron70b_medembed.jsonl
│           ├── scott_ds_meditron8b_bge.jsonl
│           ├── scott_ds_meditron8b_bge_1.jsonl
│           ├── scott_ds_openai_medembed.jsonl
│           ├── scott_ds_openai_medembed_1.jsonl
│           ├── scott_ds_queries.jsonl
│           ├── scott_gpt4o_bge.yaml
│           ├── scott_meditron70b_bge.yaml
│           ├── scott_meditron70b_bge_1.yaml
│           ├── scott_meditron70b_medembed.yaml
│           ├── scott_meditron8b_bge.yaml
│           ├── scottcorr_ds_meditron70b_bge.jsonl
│           ├── scottcorr_ds_meditron8b_bge.jsonl
│           └── scottcorr_ds_queries.jsonl
├── mmore_logo.jpg
├── production-config
│   ├── index
│   │   └── config.yaml
│   ├── postprocessor
│   │   └── config.yaml
│   ├── process
│   │   └── config.yaml
│   ├── rag
│   │   ├── config.yaml
│   │   ├── config_api.yaml
│   │   └── queries.jsonl
│   └── retriever_api
│       └── config.yaml
├── pyproject.toml
├── scripts
│   ├── data_extractor.py
│   ├── lm_eval_rag.sh
│   ├── process_distributed.sh
│   ├── runai
│   │   └── entrypoint.sh
│   └── setup.sh
├── src
│   ├── __init__.py
│   └── mmore
│       ├── __init__.py
│       ├── __main__.py
│       ├── cli.py
│       ├── dashboard
│       │   ├── __init__.py
│       │   ├── backend
│       │   │   ├── __init__.py
│       │   │   ├── benchmarks
│       │   │   │   ├── __init__.py
│       │   │   │   ├── statistics.py
│       │   │   │   └── test_spam.py
│       │   │   ├── client.py
│       │   │   └── model.py
│       │   ├── frontend
│       │   │   ├── .gitignore
│       │   │   ├── eslint.config.js
│       │   │   ├── index.html
│       │   │   ├── package-lock.json
│       │   │   ├── package.json
│       │   │   ├── public
│       │   │   │   └── vite.svg
│       │   │   ├── src
│       │   │   │   ├── App.css
│       │   │   │   ├── App.tsx
│       │   │   │   ├── GraphAndSublistView.tsx
│       │   │   │   ├── OverallProgressView.tsx
│       │   │   │   ├── Progress.ts
│       │   │   │   ├── Report.ts
│       │   │   │   ├── ReportsChartView.tsx
│       │   │   │   ├── ReportsTableView.tsx
│       │   │   │   ├── WorkerLatest.ts
│       │   │   │   ├── assets
│       │   │   │   │   └── react.svg
│       │   │   │   ├── index.css
│       │   │   │   ├── main.tsx
│       │   │   │   └── vite-env.d.ts
│       │   │   ├── tsconfig.app.json
│       │   │   ├── tsconfig.json
│       │   │   ├── tsconfig.node.json
│       │   │   └── vite.config.ts
│       │   ├── node_modules
│       │   │   ├── .package-lock.json
│       │   │   └── safecolor
│       │   │       ├── LICENSE
│       │   │       ├── README.md
│       │   │       ├── package.json
│       │   │       └── safecolor.js
│       │   ├── package-lock.json
│       │   └── package.json
│       ├── index
│       │   ├── __init__.py
│       │   └── indexer.py
│       ├── process
│       │   ├── __init__.py
│       │   ├── config.py
│       │   ├── crawler.py
│       │   ├── dispatcher.py
│       │   ├── drive_download.py
│       │   ├── execution_state.py
│       │   ├── post_processor
│       │   │   ├── __init__.py
│       │   │   ├── base.py
│       │   │   ├── chunker
│       │   │   │   ├── __init__.py
│       │   │   │   ├── multimodal.py
│       │   │   │   └── utils.py
│       │   │   ├── filter
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base.py
│       │   │   │   └── datatrove_wrapper.py
│       │   │   ├── metafuse
│       │   │   │   ├── __init__.py
│       │   │   │   └── base.py
│       │   │   ├── ner
│       │   │   │   ├── __init__.py
│       │   │   │   ├── _prompts.py
│       │   │   │   ├── extractor.py
│       │   │   │   ├── ner.py
│       │   │   │   └── output_parser.py
│       │   │   ├── pipeline.py
│       │   │   ├── tagger
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base.py
│       │   │   │   ├── file_namer.py
│       │   │   │   ├── hash.py
│       │   │   │   ├── lang_detector.py
│       │   │   │   ├── modalities.py
│       │   │   │   └── words.py
│       │   │   └── translator
│       │   │       ├── __init__.py
│       │   │       └── base.py
│       │   ├── processors
│       │   │   ├── __init__.py
│       │   │   ├── base.py
│       │   │   ├── docx_processor.py
│       │   │   ├── eml_processor.py
│       │   │   ├── html_processor.py
│       │   │   ├── md_processor.py
│       │   │   ├── media_processor.py
│       │   │   ├── pdf_processor.py
│       │   │   ├── pptx_processor.py
│       │   │   ├── spreadsheet_processor.py
│       │   │   ├── txt_processor.py
│       │   │   └── url_processor.py
│       │   └── utils.py
│       ├── rag
│       │   ├── __init__.py
│       │   ├── evaluator.py
│       │   ├── llm.py
│       │   ├── model
│       │   │   ├── __init__.py
│       │   │   ├── dense
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base.py
│       │   │   │   └── multimodal.py
│       │   │   └── sparse
│       │   │       ├── __init__.py
│       │   │       ├── base.py
│       │   │       └── splade.py
│       │   ├── pipeline.py
│       │   ├── retriever.py
│       │   ├── types.py
│       │   └── vectorstore.py
│       ├── run_dashboard_backend.py
│       ├── run_index.py
│       ├── run_index_api.py
│       ├── run_live_retrieval.py
│       ├── run_postprocess.py
│       ├── run_process.py
│       ├── run_rag.py
│       ├── run_ragcli.py
│       ├── run_retriever.py
│       ├── run_websearch.py
│       ├── type.py
│       ├── utils.py
│       └── websearchRAG
│           ├── config.py
│           ├── logging_config.py
│           ├── pipeline.py
│           └── websearch.py
└── tests
    ├── test_indexer.py
    ├── test_postprocessors.py
    └── test_processors_local.py
/.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | tests 3 | test_data -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # https://docs.github.com/en/code-security/dependabot/working-with-dependabot/dependabot-options-reference#package-ecosystem- 2 | version: 2 3 | updates: 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: 📦 Publish Python Package 2 | on: 3 | release: 4 | types: [published] 5 | permissions: 6 | contents: read 7 | jobs: 8 | release-build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v5 12 | - uses: actions/setup-python@v5 13 | with: 14 | python-version: "3.x" 15 | - name: Build release distributions 16 | run: | 17 | python -m pip install build 18 | python -m build 19 | - name: Upload distributions 20 | uses: actions/upload-artifact@v4 21 | with: 22 | name: release-dists 23 | path: dist/ 24 | pypi-publish: 25 | runs-on: ubuntu-latest 26 | needs: 27 | - release-build 28 | permissions: 29 | id-token: write 30 | # Dedicated environments with protections for publishing are strongly recommended. 31 | environment: 32 | name: pypi 33 | url: https://pypi.org/p/mmore 34 | steps: 35 | - name: Retrieve release distributions 36 | uses: actions/download-artifact@v5 37 | with: 38 | name: release-dists 39 | path: dist/ 40 | - name: Publish release distributions to PyPI 41 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /.github/workflows/push-to-registry.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | workflow_dispatch: 8 | 9 | jobs: 10 | docker: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | platform: [cpu, gpu] 16 | user: 17 | - { uid: 1000, gid: 1000, tag: "10001000" } 18 | 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v5 22 | 23 | - name: Set up QEMU 24 | uses: docker/setup-qemu-action@v3 25 | 26 | - name: Set up Docker Buildx 27 | uses: docker/setup-buildx-action@v3 28 | 29 | - name: Login to Docker Hub 30 | uses: docker/login-action@v3 31 | with: 32 | username: ${{ secrets.DOCKERHUB_USERNAME }} 33 | password: ${{ secrets.DOCKERHUB_TOKEN }} 34 | 35 | - name: Docker meta 36 | id: meta 37 | uses: docker/metadata-action@v5 38 | with: 39 | images: androz2091/swiss-ai-mmore 40 | tags: | 41 | type=ref,event=branch 42 | type=sha 43 | 44 | - name: Build & push ${{ matrix.platform }} variant ${{ matrix.user.tag }} 45 | uses: docker/build-push-action@v6 46 | with: 47 | context: .
48 | push: true 49 | tags: | 50 | ${{ steps.meta.outputs.tags }}-${{ matrix.platform }}-${{ matrix.user.tag }} 51 | labels: ${{ steps.meta.outputs.labels }} 52 | build-args: | 53 | PLATFORM=${{ matrix.platform }} 54 | USER_UID=${{ matrix.user.uid }} 55 | USER_GID=${{ matrix.user.gid }} 56 | cache-from: type=gha 57 | cache-to: type=gha,mode=max 58 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: 🧹 Ruff linter checks 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | jobs: 8 | lint: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v5 12 | - uses: astral-sh/ruff-action@v3 13 | - run: ruff check 14 | - run: ruff format --check -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: 🧪 PyTest unit tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | python-version: ["3.10", "3.11", "3.12", "3.x"] 16 | 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v5 20 | 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -e '.[rag,dev]' # or custom setup 30 | pip install pytest # if not in requirements.txt 31 | 32 | - name: Show installed cohere and langchain-cohere versions 33 | run: | 34 | pip show cohere || echo "Cohere not installed" 35 | pip show langchain-cohere || echo "Langchain-cohere not installed" 36 | 37 | 38 | - name: Run tests 39 | run: | 40 | pytest 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | processed_texts/ 2 | 0000/ 3 | 0000_small/ 4 | *.out 5 | *.lock 6 | *.zip 7 | *.log 8 | .local/ 9 | .bashrc 10 | .zcompdump 11 | dask.yaml 12 | *.fish 13 | uv-receipt.json 14 | *.crl 15 | .gitconfig 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | cover/ 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # MacOS 84 | .DS_Store 85 | 86 | # PyCharm 87 | .idea/ 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pdm 94 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 95 | #pdm.lock 96 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 97 | # in version control. 98 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 99 | .pdm.toml 100 | .pdm-python 101 | .pdm-build/ 102 | 103 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 104 | __pypackages__/ 105 | 106 | # Environments 107 | .env 108 | .*venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Milvus DB 116 | db/ 117 | *.db 118 | 119 | # Project files 120 | tmp/ 121 | scheduler-file.json 122 | requirements-dev.lock 123 | requirements.lock 124 | . 125 | examples/outputs/ 126 | test*.sh 127 | 128 | .venv 129 | 130 | # Examples 131 | examples/outputs 132 | outputs/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/isort 3 | rev: 6.0.1 4 | hooks: 5 | - id: isort 6 | - repo: https://github.com/astral-sh/ruff-pre-commit 7 | rev: v0.12.11 8 | hooks: 9 | - id: ruff-check 10 | args: [ 11 | --fix, # auto-fix lint + style issues 12 | --unsafe-fixes, # allows formatting & import sorting 13 | ] 14 | 15 | - repo: https://github.com/codespell-project/codespell 16 | rev: v2.4.1 17 | hooks: 18 | - id: codespell # See pyproject.toml for args 19 | additional_dependencies: 20 | - tomli 21 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PLATFORM 2 | ARG UV_ARGUMENTS="" 3 | 4 | FROM nvidia/cuda:12.2.2-base-ubuntu22.04 AS gpu 5 | ARG PLATFORM 6 | RUN echo "Using GPU image" 7 | 8 | FROM ubuntu:22.04 AS cpu 9 | ARG PLATFORM 10 | ARG UV_ARGUMENTS="--extra cpu" 11 | RUN echo "Using CPU-only image" 12 | 13 | FROM ${PLATFORM:-gpu} AS build 14 | ARG UV_ARGUMENTS 15 | 16 | ARG USER_UID=1000 17 | ARG USER_GID=1000 18 | 19 | RUN apt-get update && \ 20 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 21 | python3-venv python3-pip \ 22 | tzdata nano curl ffmpeg libsm6 libxext6 chromium-browser libnss3 libgconf-2-4 \ 23 | libxi6 libxrandr2 libxcomposite1 libxcursor1 libxdamage1 libxfixes3 libxrender1 \ 24 | libasound2 libatk1.0-0 libgtk-3-0 libreoffice libjpeg-dev libpango-1.0-0 \ 25 | libpangoft2-1.0-0 weasyprint && \ 26 | ln -fs /usr/share/zoneinfo/Europe/Zurich /etc/localtime && \ 27 | 
dpkg-reconfigure --frontend noninteractive tzdata && \ 28 | apt-get clean && rm -rf /var/lib/apt/lists/* 29 | 30 | # Create a non-root user 31 | RUN groupadd --gid ${USER_GID} mmoreuser \ 32 | && useradd --uid ${USER_UID} --gid ${USER_GID} -m mmoreuser 33 | 34 | WORKDIR /app 35 | RUN chown -R mmoreuser:mmoreuser /app 36 | 37 | USER mmoreuser 38 | 39 | RUN python3 -m venv .venv \ 40 | && .venv/bin/pip install --no-cache-dir uv 41 | 42 | COPY pyproject.toml poetry.lock* /app/ 43 | COPY --chown=mmoreuser:mmoreuser . /app 44 | 45 | RUN .venv/bin/uv pip install --no-cache ${UV_ARGUMENTS} -e . 46 | 47 | ENV PATH="/app/.venv/bin:$PATH" 48 | ENV DASK_DISTRIBUTED__WORKER__DAEMON=False 49 | 50 | ENTRYPOINT ["/bin/bash"] 51 | -------------------------------------------------------------------------------- /docs/distributed_processing.md: -------------------------------------------------------------------------------- 1 | # Distributed Document Processing Guide 2 | 3 | This guide explains how to set up and run distributed document processing for the RAG system across multiple nodes. 4 | 5 | ## Overview 6 | 7 | Distributed processing allows you to scale document indexing across multiple machines, significantly improving processing speed for large document collections. The system uses Dask for distributed task scheduling and execution. 8 | 9 | ## Prerequisites 10 | 11 | - Multiple machines/nodes with network connectivity 12 | - Python environment on each node 13 | - Access to a shared filesystem or the ability to copy files between nodes 14 | 15 | ## Setup Process 16 | 17 | ### 1. Prepare Your Configuration File 18 | 19 | Check that your processing configuration file ([example](/examples/process/config.yaml)) includes the distributed settings: 20 | 21 | ```yaml 22 | dispatcher_config: 23 | distributed: true 24 | scheduler_file: "/path/to/scheduler.json" # Shared location accessible by all nodes 25 | ``` 26 | 27 | Other important configuration options: 28 | - `input_folder`: Path to your documents 29 | - `output_folder`: Where processed results will be stored 30 | - `use_fast_processors`: Set to `true` for faster processing (may reduce accuracy) 31 | 32 | ### 2. Install Dependencies on All Nodes 33 | 34 | On each node, run: 35 | 36 | ```bash 37 | # Clone the repository (if not already done) 38 | git clone https://github.com/swiss-ai/mmore 39 | cd mmore 40 | 41 | # Make a virtual environment 42 | python -m venv .venv 43 | source .venv/bin/activate 44 | 45 | # Install dependencies 46 | pip install -e . 47 | ``` 48 | 49 | ### 3. Launch the Distributed Processing 50 | 51 | #### Step 1: Start the Master Node (Rank 0) 52 | 53 | ```bash 54 | bash scripts/process_distributed.sh --config-file /path/to/config.yaml --rank 0 55 | ``` 56 | 57 | The master node will: 58 | - Start the Dask scheduler 59 | - Launch a worker process 60 | - Prompt you to start the processing when ready 61 | 62 | #### Step 2: Start Worker Nodes (Rank > 0) 63 | 64 | On each additional node, run: 65 | 66 | ```bash 67 | bash scripts/process_distributed.sh --config-file /path/to/config.yaml --rank 1 68 | ``` 69 | 70 | Replace `rank 1` with a unique rank number for each node (1, 2, 3, etc.). The node should be ready within a few seconds. 71 | 72 | #### Step 3: Begin Processing 73 | 74 | Once all nodes are running, return to the master node and type `go`. The master node then crawls the input folder, splits the workload among the connected nodes, and starts their work.
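Before typing `go`, it can help to confirm that every worker has actually registered with the scheduler. Below is a minimal sketch using the Dask client API (an illustration, not a script shipped in this repository; it assumes `dask.distributed` is installed alongside mmore and reuses the scheduler-file path from your config):

```python
from dask.distributed import Client

# Connect to the running scheduler via the shared scheduler file
# (the same path set in dispatcher_config.scheduler_file).
client = Client(scheduler_file="/path/to/scheduler.json")

# One entry per connected worker; the count should match the number of nodes you started.
print(len(client.scheduler_info()["workers"]), "workers connected")
```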
75 | 76 | At the end of processing, the master node automatically shuts down the Dask scheduler, which in turn shuts down the Dask workers on all the connected nodes. 77 | 78 | ## Monitoring Progress 79 | 80 | You can monitor the processing with the dashboard; see its [documentation](./dashboard.md). 81 | 82 | The dashboard provides: 83 | - Real-time progress visualization 84 | - Worker status monitoring 85 | - The ability to gracefully stop workers 86 | 87 | ## Output Structure 88 | 89 | After processing completes, the output will be organized as follows: 90 | 91 | ``` 92 | output_folder/ 93 | ├── processors/ 94 | │ ├── Processor_type_1/ 95 | │ │ └── results.jsonl 96 | │ ├── Processor_type_2/ 97 | │ │ └── results.jsonl 98 | │ └── ... 99 | ├── merged/ 100 | │ └── merged_results.jsonl 101 | └── images/ 102 | ``` 103 | 104 | ## Troubleshooting 105 | 106 | - **Workers not connecting**: Ensure all nodes can access the scheduler file location 107 | - **Processing errors**: Check logs on the master node 108 | - **Performance issues**: Adjust batch sizes and worker counts in the configuration 109 | 110 | ## Advanced Configuration 111 | 112 | For optimal performance, consider adjusting: 113 | - Processor batch sizes 114 | - Number of threads per worker 115 | - Memory limits for workers 116 | 117 | Refer to the [process documentation](./process.md) for more details on configuration options. 118 | -------------------------------------------------------------------------------- /docs/doc_images/backend_image 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/backend_image 1.png -------------------------------------------------------------------------------- /docs/doc_images/backend_image 2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/backend_image 2.png -------------------------------------------------------------------------------- /docs/doc_images/backend_image 3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/backend_image 3.png -------------------------------------------------------------------------------- /docs/doc_images/backend_image 4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/backend_image 4.png -------------------------------------------------------------------------------- /docs/doc_images/backend_image 5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/backend_image 5.png -------------------------------------------------------------------------------- /docs/doc_images/backend_image 6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/backend_image 6.png -------------------------------------------------------------------------------- /docs/doc_images/backend_image 7.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/backend_image 7.png -------------------------------------------------------------------------------- /docs/doc_images/backend_image 8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/backend_image 8.png -------------------------------------------------------------------------------- /docs/doc_images/backend_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/backend_image.png -------------------------------------------------------------------------------- /docs/doc_images/index_api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/docs/doc_images/index_api.png -------------------------------------------------------------------------------- /docs/evaluation.md: -------------------------------------------------------------------------------- 1 | # 🤖 MMORE RAG Evaluation Pipeline 2 | 3 | ## 💡 TL;DR 4 | 5 | The `RAG` module comes with an Evaluator that allows you to evaluate your full RAG pipeline—from the context retrieval to the LLM's output: 6 | 7 | 1. **Prepare your benchmark evaluation dataset** in the required format. 8 | 2. **Choose your list of metrics** to evaluate [Available Metrics](https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/). 9 | 3. **Set up your configuration files** for the evaluator, indexer, and RAG pipeline. 10 | 4. **Run the evaluation** for your RAG setup (retriever + LLM). 11 | 12 | 🚀 **Powered by RAGAS**: Ragas is a library designed to supercharge the evaluation of Large Language Model (LLM) applications. 13 | 14 | ## 💻 Minimal Example 15 | 16 | Here's a step-by-step guide to set up the evaluation pipeline: 17 | 18 | ### 1. **Create the Evaluator Config File**: 19 | 20 | This file defines the evaluation settings for your pipeline. 21 | 22 | ```yaml 23 | hf_dataset_name: "Mallard74/eval_medical_benchmark" # Hugging Face Eval dataset name (Example dataset) 24 | split: "train" # Dataset split 25 | hf_feature_map: {'user_input': 'user_input', 'reference': 'reference', 'corpus': 'corpus', 'query_id': 'query_ids'} # Column mapping 26 | metrics: # List of metrics to evaluate 27 | - LLMContextRecall 28 | - Faithfulness 29 | - FactualCorrectness 30 | - SemanticSimilarity 31 | embeddings_name: "all-MiniLM-L6-v2" # Evaluator Embedding model name 32 | llm: # Evaluator LLM config 33 | llm_name: "gpt-4o" 34 | max_new_tokens: 150 35 | ``` 36 | ### 2. **Create the Indexer Config File**: 37 | 38 | This file configures the indexer for your evaluation. 39 | 40 | ```yaml 41 | dense_model_name: sentence-transformers/all-MiniLM-L6-v2 42 | sparse_model_name: splade 43 | db: 44 | uri: "./examples/rag/milvus_mock_eval_medical_benchmark.db" # Dataset's Vectorstore URI 45 | name: "mock_eval_medical_benchmark" 46 | chunker: 47 | chunking_strategy: sentence # Your chunking strategy 48 | ``` 49 | 50 | ### 3. **Create the RAG Pipeline Config File**: 51 | 52 | This file sets up the RAG pipeline for evaluation. 
53 | 54 | ```yaml 55 | llm: 56 | llm_name: "gpt-4o-mini" # RAG LLM model to evaluate 57 | max_new_tokens: 150 58 | retriever: 59 | db: 60 | uri: "./examples/rag/milvus_mock_eval_medical_benchmark.db" # Dataset's Vectorstore URI 61 | hybrid_search_weight: 0.5 62 | k: 3 63 | ``` 64 | 65 | ### 4. **Run the Evaluation**: 66 | 67 | Once the configuration files are in place, you can run the evaluation pipeline with the following Python script: 68 | 69 | ```python 70 | from mmore.rag.evaluator import RAGEvaluator 71 | 72 | # Instantiate the evaluator from the config file created in step 1 73 | evaluator = RAGEvaluator.from_config("path/to/evaluator_config.yaml") 74 | 75 | # Run the evaluation with the indexer and RAG configs from steps 2 and 3 76 | result = evaluator( 77 | indexer_config="path/to/indexer_config.yaml", 78 | rag_config="path/to/rag_config.yaml", 79 | ) 80 | ``` 81 | 82 | - See [`examples/rag/evaluation`](../examples/rag/evaluation) for a simple example. 83 | > :warning: Note that you should create a separate database file for each dataset. The pipeline will create partitions per dense model for convenience. -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # :robot: MMORE Index 2 | 3 | ## :bulb: TL;DR 4 | 5 | > The `Index` module handles the indexing and post-processing of the extracted data from the multimodal documents. It creates an indexed Vector Store DB based on [Milvus](https://milvus.io/). We enable the use of *hybrid* retrieval, combining both *dense* and *sparse* retrieval. 6 | > 7 | > You can customize various parts of the pipeline by defining an inference indexing config file. 8 | 9 | ## :computer: Minimal Example: 10 | Here is a minimal example to index [processed documents](process.md). 11 | 1. Create a config file based on [the example](/examples/index/config.yaml). 12 | 13 | 2. Index your documents by calling the inference script: 14 | ```bash 15 | python3 -m mmore index --config_file /path/to/config.yaml 16 | ``` 17 | See [`examples/index`](../examples/index/) for other examples. -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | To install `mmore`, run the following: 4 | 5 | 1. Clone the repository 6 | ```bash 7 | git clone https://github.com/swiss-ai/mmore 8 | ``` 9 | 10 | 2. Install the package 11 | ```bash 12 | pip install -e . 13 | ``` 14 | 15 | ### Alternative #1: `uv` 16 | 17 | ##### Step 1: Install system dependencies 18 | 19 | ```bash 20 | sudo apt update 21 | sudo apt install -y ffmpeg libsm6 libxext6 chromium-browser libnss3 \ 22 | libgconf-2-4 libxi6 libxrandr2 libxcomposite1 libxcursor1 libxdamage1 \ 23 | libxext6 libxfixes3 libxrender1 libasound2 libatk1.0-0 libgtk-3-0 libreoffice \ 24 | libpango-1.0-0 libpangoft2-1.0-0 weasyprint 25 | ``` 26 | 27 | ##### Step 2: Install `uv` 28 | 29 | Refer to the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/) for detailed instructions.
30 | ``` 31 | curl -LsSf https://astral.sh/uv/install.sh | sh 32 | ``` 33 | 34 | ##### Step 3: Clone this repository 35 | 36 | ```bash 37 | git clone https://github.com/swiss-ai/mmore 38 | cd mmore 39 | ``` 40 | 41 | ##### Step 4: Install project and dependencies 42 | 43 | ```bash 44 | uv sync 45 | ``` 46 | 47 | For CPU-only installation, use: 48 | 49 | ```bash 50 | uv sync --extra cpu 51 | ``` 52 | 53 | ##### Step 5: Run a test command 54 | 55 | Activate the virtual environment before running commands: 56 | 57 | ```bash 58 | source .venv/bin/activate 59 | ``` 60 | ### Alternative #2: `Docker` 61 | 62 | **Note:** For manual installation without Docker, refer to the sections above. 63 | 64 | ##### Step 1: Install Docker 65 | 66 | Follow the official [Docker installation guide](https://docs.docker.com/get-started/get-docker/). 67 | 68 | ##### Step 2: Build the Docker image 69 | 70 | ```bash 71 | sudo docker build . --tag mmore 72 | ``` 73 | 74 | To build for CPU-only platforms (results in a smaller image size): 75 | 76 | ```bash 77 | sudo docker build --build-arg PLATFORM=cpu -t mmore . 78 | ``` 79 | 80 | *Running on RCP:* you can specify a `USER_UID` and a `USER_GID` variable. Set them to your RCP user ID and group ID to run it there. 81 | 82 | ##### Step 3: Start an interactive session 83 | 84 | ```bash 85 | sudo docker run --gpus all -it -v ./examples:/app/examples -v ./.cache:/mmoreuser/.cache mmore 86 | ``` 87 | 88 | For CPU-only platforms: 89 | ```bash 90 | sudo docker run -it -v ./examples:/app/examples -v ./.cache:/mmoreuser/.cache mmore 91 | ``` 92 | 93 | > [!WARNING] 94 | > You may need the Nvidia toolkit so the containers can access your GPUs. 95 | > Read [this tutorial](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) if something breaks here! 96 | > 97 | > Configure the production repository: 98 | > 99 | > ```sh 100 | > curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ 101 | > && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ 102 | > sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ 103 | > sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list 104 | > ``` 105 | > 106 | > ```sh 107 | > sudo apt update 108 | > sudo apt install -y nvidia-container-toolkit 109 | > ``` 110 | > 111 | > Modify the Docker daemon to use Nvidia: 112 | > 113 | > ```sh 114 | > sudo nvidia-ctk runtime configure --runtime=docker 115 | > sudo systemctl restart docker 116 | > ``` 117 | > 118 | > You can now use `docker run --gpus all`! 119 | 120 | *Note:* The `examples` folder is mapped to `/app/examples` inside the container, corresponding to the default path in `examples/process/config.yaml`. 121 | -------------------------------------------------------------------------------- /docs/uv.md: -------------------------------------------------------------------------------- 1 | # Use `uv` to install `mmore` 2 | 3 | `uv` is an extremely fast Python package and project manager, written in Rust. It can act as a wrapper around `pip` to speed up installations.

*(Image: shows a bar chart with benchmark results.)*

12 | 13 | ## Install `uv` 14 | 15 | Install `uv` with the standalone installers: 16 | 17 | ```bash 18 | # On macOS and Linux. 19 | curl -LsSf https://astral.sh/uv/install.sh | sh 20 | ``` 21 | 22 | ```bash 23 | # On Windows. 24 | powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" 25 | ``` 26 | 27 | ## Install `mmore` with `uv` 28 | First, create a new venv at the repo's location: 29 | ```bash 30 | uv venv 31 | source .venv/bin/activate 32 | ``` 33 | 34 | Then install `mmore` by prepending `uv` to the usual `pip` commands: 35 | ```bash 36 | uv pip install -e . 37 | ``` 38 | 39 | -------------------------------------------------------------------------------- /examples/cc/process_config.yaml: -------------------------------------------------------------------------------- 1 | dispatcher_config: 2 | output_path: examples/mmore_output_fast/ 3 | use_fast_processors: true 4 | distributed: false 5 | extract_images: true 6 | scheduler_file: scheduler-file.json 7 | process_batch_sizes: 8 | - PDFProcessor: 4000 9 | processor_config: 10 | PDFProcessor: 11 | - PDFTEXT_CPU_WORKERS: 0 12 | - DETECTOR_BATCH_SIZE: 1 13 | - DETECTOR_POSTPROCESSING_CPU_WORKERS: 0 14 | - RECOGNITION_BATCH_SIZE: 1 15 | - OCR_PARALLEL_WORKERS: 0 16 | - TEXIFY_BATCH_SIZE: 1 17 | - LAYOUT_BATCH_SIZE: 1 18 | - ORDER_BATCH_SIZE: 1 19 | - TABLE_REC_BATCH_SIZE: 1 20 | data_path: examples/sample_data 21 | -------------------------------------------------------------------------------- /examples/index/bge/config.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: BAAI/bge-small-en 4 | sparse_model: 5 | model_name: splade 6 | db: 7 | uri: ./examples/index/bge/ner.db 8 | collection_name: my_docs 9 | documents_path: examples/example_outputs_ner/pp/final_pp.jsonl -------------------------------------------------------------------------------- /examples/index/config.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: sentence-transformers/all-MiniLM-L6-v2 4 | is_multimodal: false 5 | sparse_model: 6 | model_name: splade 7 | is_multimodal: false 8 | db: 9 | uri: ./proc_demo.db 10 | name: my_db 11 | collection_name: my_docs 12 | documents_path: 'examples/postprocessor/outputs/merged/final_pp.jsonl' 13 | -------------------------------------------------------------------------------- /examples/index/openai/config.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: text-embedding-3-small 4 | sparse_model: 5 | model_name: splade 6 | db: 7 | uri: ./examples/index/openai/ner.db 8 | collection_name: my_docs 9 | documents_path: examples/example_outputs_ner/pp/final_pp.jsonl -------------------------------------------------------------------------------- /examples/index/openai/ner.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/index/openai/ner.db -------------------------------------------------------------------------------- /examples/index/qwen/config.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: Qwen/Qwen2-VL-7B 4 | is_multimodal: true 5 | sparse_model: 6 | model_name: splade 7 | db: 8 | uri: ./examples/index/qwen/ner.db 9 | collection_name: my_docs 10 | documents_path: 
examples/example_outputs_ner -------------------------------------------------------------------------------- /examples/index/qwen/ner.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/index/qwen/ner.db -------------------------------------------------------------------------------- /examples/postprocessor/config.yaml: -------------------------------------------------------------------------------- 1 | pp_modules: 2 | - type: file_namer 3 | - type: chunker 4 | args: 5 | chunking_strategy: sentence 6 | - type: translator 7 | args: 8 | target_language: en 9 | attachment_tag: 10 | confidence_threshold: 0.7 11 | constrained_languages: 12 | - fr 13 | - en 14 | - type: metafuse 15 | args: 16 | metadata_keys: 17 | - file_name 18 | content_template: Content from {file_name} 19 | position: beginning 20 | 21 | output: 22 | output_path: examples/postprocessor/outputs/merged/ 23 | save_each_step: True 24 | -------------------------------------------------------------------------------- /examples/process/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: examples/sample_data/ #put absolute path! Possible to pass a list of folders as well 2 | google_drive_ids: [] #put ids of google drive folders 3 | dispatcher_config: 4 | output_path: examples/process/outputs/ #put absolute path or relative to the root of the module 5 | use_fast_processors: false 6 | distributed: false 7 | dashboard_backend_url: null 8 | extract_images: true 9 | scheduler_file: null # for instance /path/to/mmore/scheduler-file.json 10 | process_batch_sizes: 11 | - URLProcessor: 40 12 | - DOCXProcessor: 100 13 | - PDFProcessor: 4000 14 | - MediaProcessor: 40 15 | - SpreadsheetProcessor: 100 16 | - TXTProcessor: 100 17 | - PPTXProcessor: 100 18 | - MarkdownProcessor: 100 19 | - EMLProcessor: 100 20 | - HTMLProcessor: 100 21 | processor_config: 22 | MediaProcessor: 23 | - normal_model: "openai/whisper-large-v3-turbo" 24 | - fast_model: "openai/whisper-tiny" 25 | - type: "automatic-speech-recognition" 26 | - sample_rate: 10 27 | - batch_size: 4 28 | 29 | PDFProcessor: 30 | - PDFTEXT_CPU_WORKERS: 0 31 | - DETECTOR_BATCH_SIZE: 1 32 | - DETECTOR_POSTPROCESSING_CPU_WORKERS: 0 33 | - RECOGNITION_BATCH_SIZE: 1 34 | - OCR_PARALLEL_WORKERS: 0 35 | - TEXIFY_BATCH_SIZE: 1 36 | - LAYOUT_BATCH_SIZE: 1 37 | - ORDER_BATCH_SIZE: 1 38 | - TABLE_REC_BATCH_SIZE: 1 39 | -------------------------------------------------------------------------------- /examples/pubmedqa/index_bge.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: BAAI/bge-large-en-v1.5 4 | sparse_model: 5 | model_name: splade 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/pubmedqa/processed/chunked_bge_ivf.db 8 | collection_name: who 9 | documents_path: "/mnt/mlo/scratch/homes/mmore/datasets/pubmedqa/processed/pp/1___🧩 Chunker.jsonl" 10 | batch_size: 64 -------------------------------------------------------------------------------- /examples/pubmedqa/index_stella.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: dunzhang/stella_en_400M_v5 4 | sparse_model: 5 | model_name: splade 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/pubmedqa/processed/chunked_stella_ivf.db 8 | collection_name: who 9 | documents_path: 
"/mnt/mlo/scratch/homes/mmore/datasets/pubmedqa/processed/pp/1___🧩 Chunker.jsonl" 10 | batch_size: 64 -------------------------------------------------------------------------------- /examples/pubmedqa/pp.yaml: -------------------------------------------------------------------------------- 1 | data_path: /mnt/mlo/scratch/homes/mmore/datasets/pubmedqa/processed/merged/merged_results_corrected.jsonl 2 | pipeline: 3 | pp_modules: 4 | - type: chunker 5 | args: 6 | chunking_strategy: sentence 7 | - type: filter_gopher-repetition 8 | args: 9 | exclusion_writer: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/logs/gopher-rep.jsonl 10 | - type: filter_gopher-quality 11 | args: 12 | exclusion_writer: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/logs/gopher-quality.jsonl 13 | - type: filter_c4 14 | args: 15 | exclusion_writer: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/logs/c4.jsonl 16 | - type: filter_fineweb 17 | args: 18 | exclusion_writer: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/logs/fineweb.jsonl 19 | output: 20 | output_path: /mnt/mlo/scratch/homes/mmore/datasets/pubmedqa/processed/pp 21 | save_each_step: True -------------------------------------------------------------------------------- /examples/rag/config.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/meditron3-8b 4 | max_new_tokens: 1200 5 | retriever: 6 | db: 7 | uri: ./proc_demo.db 8 | name: 'my_db' 9 | hybrid_search_weight: 0.5 10 | k: 5 11 | use_web: false 12 | reranker_model_name: BAAI/bge-reranker-base 13 | system_prompt: "Use the following context to answer the questions.\n\nContext:\n{context}" 14 | mode: local 15 | mode_args: 16 | input_file: examples/rag/queries.jsonl 17 | output_file: examples/rag/output.json 18 | -------------------------------------------------------------------------------- /examples/rag/config_api.yaml: -------------------------------------------------------------------------------- 1 | # RAG Config 2 | rag: 3 | # LLM Config 4 | llm: 5 | llm_name: Qwen/Qwen3-8B # "epfl-llm/meditron-70b" # "gpt-4o-mini" # Anything supported 6 | max_new_tokens: 1200 7 | temperature: 0.8 8 | # Retriever Config 9 | retriever: 10 | db: 11 | uri: ./proc_demo.db 12 | name: my_db 13 | hybrid_search_weight: 0.5 14 | k: 5 15 | # Prompt Args 16 | system_prompt: "Answer the question using the context.\n\nContext: {context}" 17 | # Mode Config 18 | mode: api 19 | mode_args: 20 | endpoint: '/rag' 21 | port: 8000 22 | host: 'localhost' 23 | -------------------------------------------------------------------------------- /examples/rag/evaluation/indexer_eval_example_config.yaml: -------------------------------------------------------------------------------- 1 | dense_model_name: 'all-MiniLM-L6-v2' 2 | sparse_model_name: 'splade' 3 | db: 4 | uri: "./examples/rag/mock_eval_benchmark.db" # Dataset's Vectorstore URI 5 | name: "mock_eval_benchmark" 6 | chunker: 7 | chunking_strategy: sentence # Your chunking strat 8 | -------------------------------------------------------------------------------- /examples/rag/evaluation/rag_eval_example_config.yaml: -------------------------------------------------------------------------------- 1 | hf_dataset_name: "Mallard74/eval_medical_benchmark" # Hugging Face Eval dataset name (Note: this one is a mock example) 2 | split: "train" # Dataset split 3 | 
hf_feature_map: {'user_input': 'user_input', 'reference': 'reference', 'corpus': 'corpus', 'query_id': 'query_ids'} # Column mapping from your dataset feature map to our standard column map 4 | metrics: # List of metrics to evaluate 5 | - LLMContextRecall 6 | - Faithfulness 7 | - FactualCorrectness 8 | - SemanticSimilarity 9 | embeddings_name: "all-MiniLM-L6-v2" # Evaluator Embedding model name 10 | llm: # Evaluator LLM config 11 | llm_name: "gpt-4o" 12 | max_new_tokens: 150 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/rag/evaluation/rag_evaluated_example_config.yaml: -------------------------------------------------------------------------------- 1 | llm: 2 | llm_name: "gpt-4o-mini" # RAG LLM model to evaluate 3 | max_new_tokens: 150 4 | retriever: 5 | db: 6 | uri: "./examples/rag/mock_eval_benchmark.db" # Dataset's Vectorstore URI 7 | hybrid_search_weight: 0.5 8 | k: 3 -------------------------------------------------------------------------------- /examples/rag/evaluation/rag_evaluator_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from dotenv import load_dotenv 4 | 5 | from mmore.rag.evaluator import RAGEvaluator 6 | 7 | load_dotenv() 8 | 9 | MOCK_EVALUATOR_CONFIG = "./examples/rag/evaluation/rag_eval_example_config.yaml" 10 | MOCK_INDEXER_CONFIG = "./examples/rag/evaluation/indexer_eval_example_config.yaml" 11 | MOCK_RAG_CONFIG = "./examples/rag/evaluation/rag_evaluated_example_config.yaml" 12 | 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser( 16 | description="Run RAG Evaluation pipeline with specified parameters or use default mock data" 17 | ) 18 | parser.add_argument( 19 | "--eval-config", 20 | type=str, 21 | default=MOCK_EVALUATOR_CONFIG, 22 | help="Path to a RAG evaluator config file.", 23 | ) 24 | parser.add_argument( 25 | "--indexer-config", 26 | type=str, 27 | default=MOCK_INDEXER_CONFIG, 28 | help="Path to an Indexer config file.", 29 | ) 30 | parser.add_argument( 31 | "--rag-config", 32 | type=str, 33 | default=MOCK_RAG_CONFIG, 34 | help="Path to a RAG config file.", 35 | ) 36 | 37 | return parser.parse_args() 38 | 39 | 40 | if __name__ == "__main__": 41 | args = get_args() 42 | 43 | # Instantiate RAGEvaluator 44 | evaluator = RAGEvaluator.from_config(args.eval_config) 45 | 46 | # Run the evaluation 47 | result = evaluator(indexer_config=args.indexer_config, rag_config=args.rag_config) 48 | 49 | print(result) 50 | -------------------------------------------------------------------------------- /examples/rag/openai/output.jsonl: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "answer": "Barack Obama was born on August 4, 1961.", 4 | "input": "When was Barack Obama born?", 5 | "context": "[1] Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.\n\n[2] Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.\n\n[3] Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.\n\n[4] Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.\n\n[5] Elon Musk announced that SpaceX will launch its next rocket from Cape Canaveral on December 12, 2024."
6 | }, 7 | { 8 | "answer": "Google was founded by Larry Page and Sergey Brin.", 9 | "input": "Who founded Google?", 10 | "context": "[1] Google LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.\n\n[2] Google LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.\n\n[3] Google LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.\n\n[4] Google LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.\n\n[5] Elon Musk announced that SpaceX will launch its next rocket from Cape Canaveral on December 12, 2024." 11 | }, 12 | { 13 | "answer": "The Eiffel Tower is located in Paris, France.", 14 | "input": "Where is the Eiffel Tower located?", 15 | "context": "[1] The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.\n\n[2] The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.\n\n[3] The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.\n\n[4] The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.\n\n[5] Amazon shipped the package from its warehouse in Seattle to a customer in New York." 16 | }, 17 | { 18 | "answer": "The artificial intelligence conference will be held in Tokyo on March 15, 2024.", 19 | "input": "When will the artificial intelligence conference be held?", 20 | "context": "[1] The conference on artificial intelligence will be held in Tokyo on March 15, 2024.\n\n[2] The conference on artificial intelligence will be held in Tokyo on March 15, 2024.\n\n[3] The conference on artificial intelligence will be held in Tokyo on March 15, 2024.\n\n[4] The conference on artificial intelligence will be held in Tokyo on March 15, 2024.\n\n[5] Elon Musk announced that SpaceX will launch its next rocket from Cape Canaveral on December 12, 2024." 
21 | } 22 | ] -------------------------------------------------------------------------------- /examples/rag/openai/rag_openai.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: gpt-4o-mini 4 | max_new_tokens: 150 5 | retriever: 6 | db: 7 | uri: ./examples/index/openai/ner.db 8 | name: openai 9 | hybrid_search_weight: 0.5 10 | k: 5 11 | system_prompt: "Use the following context to answer the questions.\n\nContext:\n{context}" 12 | mode: api 13 | -------------------------------------------------------------------------------- /examples/rag/queries.jsonl: -------------------------------------------------------------------------------- 1 | {"input": "When was Barack Obama born?", "collection_name": "my_docs"} 2 | {"input": "Who founded Google?", "collection_name": "my_docs"} 3 | {"input": "Where is the Eiffel Tower located?", "collection_name": "my_docs"} 4 | {"input": "When will the artificial intelligence conference be held?", "collection_name": "my_docs"} -------------------------------------------------------------------------------- /examples/rag/swissai/rag_swissai.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: meta-llama/Meta-Llama-3.1-70B-Instruct 4 | base_url: https://fmapi.swissai.cscs.ch 5 | organization: swissai 6 | max_new_tokens: 200 7 | retriever: 8 | db: 9 | uri: ./examples/index/qwen/ner.db 10 | hybrid_search_weight: 0.5 11 | k: 5 12 | system_prompt: "Use the following context to answer the questions.\n\nContext:\n{context}" 13 | inference: 14 | input_file: ./examples/rag/queries.jsonl 15 | output_file: ./examples/rag/swissai/output.jsonl -------------------------------------------------------------------------------- /examples/rag/vllm/rag_vllm.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: "mosaicml/mpt-7b" 4 | base_url: "http://localhost:8000/v1" 5 | max_new_tokens: 100 6 | retriever: 7 | db: 8 | uri: ./examples/index/llamavision/ner.db 9 | hybrid_search_weight: 0.5 10 | k: 5 11 | system_prompt: "Use the following context to answer the questions.\n\nContext:\n{context}" 12 | mode: local 13 | mode_args: 14 | input_file: ./examples/rag/queries.jsonl 15 | output_file: ./examples/rag/vllm/output.jsonl -------------------------------------------------------------------------------- /examples/retriever_api/config.yaml: -------------------------------------------------------------------------------- 1 | db: 2 | uri: ./proc_demo.db 3 | name: my_db 4 | hybrid_search_weight: 0.5 5 | k: 5 6 | collection_name: my_docs -------------------------------------------------------------------------------- /examples/sample_data/docx/ums.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/sample_data/docx/ums.docx -------------------------------------------------------------------------------- /examples/sample_data/example_outputs_ner/merged/merged_results.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.", "modalities": [], "metadata": {"xNER": [{"entity": "Barack Obama", "type": "PERSON"}, {"entity": "August 4, 1961", "type": "DATE"}, {"entity": "Honolulu", "type": "LOCATION"}, {"entity": "Hawaii", "type": "LOCATION"}]}} 2 | {"text": "Google 
LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.", "modalities": [], "metadata": {"xNER": [{"entity": "Google LLC", "type": "ORGANIZATION"}, {"entity": "Mountain View", "type": "LOCATION"}, {"entity": "California", "type": "LOCATION"}, {"entity": "Larry Page", "type": "PERSON"}, {"entity": "Sergey Brin", "type": "PERSON"}]}} 3 | {"text": "The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.", "modalities": [], "metadata": {"xNER": [{"entity": "Eiffel Tower", "type": "LOCATION"}, {"entity": "Paris", "type": "LOCATION"}, {"entity": "France", "type": "LOCATION"}]}} 4 | {"text": "The conference on artificial intelligence will be held in Tokyo on March 15, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "artificial intelligence", "type": "EVENT"}, {"entity": "Tokyo", "type": "LOCATION"}, {"entity": "March 15, 2024", "type": "DATE"}]}} 5 | {"text": "Amazon shipped the package from its warehouse in Seattle to a customer in New York.", "modalities": [], "metadata": {"xNER": [{"entity": "Amazon", "type": "ORGANIZATION"}, {"entity": "Seattle", "type": "LOCATION"}, {"entity": "New York", "type": "LOCATION"}]}} 6 | {"text": "Elon Musk announced that SpaceX will launch its next rocket from Cape Canaveral on December 12, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "Elon Musk", "type": "PERSON"}, {"entity": "SpaceX", "type": "ORGANIZATION"}, {"entity": "Cape Canaveral", "type": "LOCATION"}, {"entity": "December 12, 2024", "type": "DATE"}]}} 7 | {"text": "François Hollande est né le 12 août 1954 à Rouen, en France.", "modalities": [], "metadata": {"xNER": [{"entity": "François Hollande", "type": "PERSON"}, {"entity": "12 août 1954", "type": "DATE"}, {"entity": "Rouen", "type": "LOCATION"}, {"entity": "France", "type": "LOCATION"}]}} 8 | {"text": "Google LLC, dont le siège est à Mountain View, en Californie, a été fondée par Larry Page et Sergey Brin.", "modalities": [], "metadata": {"xNER": [{"entity": "Google LLC", "type": "ORGANIZATION"}, {"entity": "Mountain View", "type": "LOCATION"}, {"entity": "Californie", "type": "LOCATION"}, {"entity": "Larry Page", "type": "PERSON"}, {"entity": "Sergey Brin", "type": "PERSON"}]}} 9 | {"text": "La tour Eiffel, située à Paris, en France, attire des millions de touristes chaque année.", "modalities": [], "metadata": {"xNER": [{"entity": "tour Eiffel", "type": "LOCATION"}, {"entity": "Paris", "type": "LOCATION"}, {"entity": "France", "type": "LOCATION"}]}} 10 | {"text": "La conférence sur l'intelligence artificielle se tiendra à Tokyo le 15 mars 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "intelligence artificielle", "type": "EVENT"}, {"entity": "Tokyo", "type": "LOCATION"}, {"entity": "15 mars 2024", "type": "DATE"}]}} 11 | {"text": "Bernard Arnault a annoncé que LVMH lancera sa prochaine campagne publicitaire pour les JO de Paris 2024 le 12 décembre 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "Bernard Arnault", "type": "PERSON"}, {"entity": "LVMH", "type": "ORGANIZATION"}, {"entity": "Paris", "type": "LOCATION"}, {"entity": "12 décembre 2024", "type": "DATE"}]}} 12 | {"text": "La Poste a expédié le colis de son entrepôt à Marseille à un client à Biarritz.", "modalities": [], "metadata": {"xNER": [{"entity": "La Poste", "type": "ORGANIZATION"}, {"entity": "Marseille", "type": "LOCATION"}, {"entity": "Biarritz", "type": "LOCATION"}]}} -------------------------------------------------------------------------------- 
/examples/sample_data/example_outputs_ner/pp/1___🌍 Language ID.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.", "modalities": [], "metadata": {"xNER": [{"entity": "Barack Obama", "type": "PERSON"}, {"entity": "August 4, 1961", "type": "DATE"}, {"entity": "Honolulu", "type": "LOCATION"}, {"entity": "Hawaii", "type": "LOCATION"}], "language": "en", "language_score": 0.9943996071815491}} 2 | {"text": "Google LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.", "modalities": [], "metadata": {"xNER": [{"entity": "Google LLC", "type": "ORGANIZATION"}, {"entity": "Mountain View", "type": "LOCATION"}, {"entity": "California", "type": "LOCATION"}, {"entity": "Larry Page", "type": "PERSON"}, {"entity": "Sergey Brin", "type": "PERSON"}], "language": "en", "language_score": 0.9700106978416443}} 3 | {"text": "The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.", "modalities": [], "metadata": {"xNER": [{"entity": "Eiffel Tower", "type": "LOCATION"}, {"entity": "Paris", "type": "LOCATION"}, {"entity": "France", "type": "LOCATION"}], "language": "en", "language_score": 0.9781987071037292}} 4 | {"text": "The conference on artificial intelligence will be held in Tokyo on March 15, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "artificial intelligence", "type": "EVENT"}, {"entity": "Tokyo", "type": "LOCATION"}, {"entity": "March 15, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9582112431526184}} 5 | {"text": "Amazon shipped the package from its warehouse in Seattle to a customer in New York.", "modalities": [], "metadata": {"xNER": [{"entity": "Amazon", "type": "ORGANIZATION"}, {"entity": "Seattle", "type": "LOCATION"}, {"entity": "New York", "type": "LOCATION"}], "language": "en", "language_score": 0.953260600566864}} 6 | {"text": "Elon Musk announced that SpaceX will launch its next rocket from Cape Canaveral on December 12, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "Elon Musk", "type": "PERSON"}, {"entity": "SpaceX", "type": "ORGANIZATION"}, {"entity": "Cape Canaveral", "type": "LOCATION"}, {"entity": "December 12, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9796376824378967}} 7 | -------------------------------------------------------------------------------- /examples/sample_data/example_outputs_ner/pp/2___🍷 FineWeb Quality.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.", "modalities": [], "metadata": {"xNER": [{"entity": "Barack Obama", "type": "PERSON"}, {"entity": "August 4, 1961", "type": "DATE"}, {"entity": "Honolulu", "type": "LOCATION"}, {"entity": "Hawaii", "type": "LOCATION"}], "language": "en", "language_score": 0.9943996071815491}} 2 | {"text": "Google LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.", "modalities": [], "metadata": {"xNER": [{"entity": "Google LLC", "type": "ORGANIZATION"}, {"entity": "Mountain View", "type": "LOCATION"}, {"entity": "California", "type": "LOCATION"}, {"entity": "Larry Page", "type": "PERSON"}, {"entity": "Sergey Brin", "type": "PERSON"}], "language": "en", "language_score": 0.9700106978416443}} 3 | {"text": "The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.", "modalities": [], "metadata": {"xNER": [{"entity": 
"Eiffel Tower", "type": "LOCATION"}, {"entity": "Paris", "type": "LOCATION"}, {"entity": "France", "type": "LOCATION"}], "language": "en", "language_score": 0.9781987071037292}} 4 | {"text": "The conference on artificial intelligence will be held in Tokyo on March 15, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "artificial intelligence", "type": "EVENT"}, {"entity": "Tokyo", "type": "LOCATION"}, {"entity": "March 15, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9582112431526184}} 5 | {"text": "Amazon shipped the package from its warehouse in Seattle to a customer in New York.", "modalities": [], "metadata": {"xNER": [{"entity": "Amazon", "type": "ORGANIZATION"}, {"entity": "Seattle", "type": "LOCATION"}, {"entity": "New York", "type": "LOCATION"}], "language": "en", "language_score": 0.953260600566864}} 6 | {"text": "Elon Musk announced that SpaceX will launch its next rocket from Cape Canaveral on December 12, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "Elon Musk", "type": "PERSON"}, {"entity": "SpaceX", "type": "ORGANIZATION"}, {"entity": "Cape Canaveral", "type": "LOCATION"}, {"entity": "December 12, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9796376824378967}} 7 | -------------------------------------------------------------------------------- /examples/sample_data/example_outputs_ner/pp/3___🧩 Chunker.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.", "modalities": [], "metadata": {"xNER": [{"entity": "Barack Obama", "type": "PERSON"}, {"entity": "August 4, 1961", "type": "DATE"}, {"entity": "Honolulu", "type": "LOCATION"}, {"entity": "Hawaii", "type": "LOCATION"}], "language": "en", "language_score": 0.9943996071815491}} 2 | {"text": "Google LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.", "modalities": [], "metadata": {"xNER": [{"entity": "Google LLC", "type": "ORGANIZATION"}, {"entity": "Mountain View", "type": "LOCATION"}, {"entity": "California", "type": "LOCATION"}, {"entity": "Larry Page", "type": "PERSON"}, {"entity": "Sergey Brin", "type": "PERSON"}], "language": "en", "language_score": 0.9700106978416443}} 3 | {"text": "The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.", "modalities": [], "metadata": {"xNER": [{"entity": "Eiffel Tower", "type": "LOCATION"}, {"entity": "Paris", "type": "LOCATION"}, {"entity": "France", "type": "LOCATION"}], "language": "en", "language_score": 0.9781987071037292}} 4 | {"text": "The conference on artificial intelligence will be held in Tokyo on March 15, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "artificial intelligence", "type": "EVENT"}, {"entity": "Tokyo", "type": "LOCATION"}, {"entity": "March 15, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9582112431526184}} 5 | {"text": "Amazon shipped the package from its warehouse in Seattle to a customer in New York.", "modalities": [], "metadata": {"xNER": [{"entity": "Amazon", "type": "ORGANIZATION"}, {"entity": "Seattle", "type": "LOCATION"}, {"entity": "New York", "type": "LOCATION"}], "language": "en", "language_score": 0.953260600566864}} 6 | {"text": "Elon Musk announced that SpaceX will launch its next rocket from Cape Canaveral on December 12, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "Elon Musk", "type": "PERSON"}, {"entity": "SpaceX", "type": "ORGANIZATION"}, {"entity": "Cape 
Canaveral", "type": "LOCATION"}, {"entity": "December 12, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9796376824378967}} 7 | -------------------------------------------------------------------------------- /examples/sample_data/example_outputs_ner/pp/4___🗣️ Lang Detector.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.", "modalities": [], "metadata": {"xNER": [{"entity": "Barack Obama", "type": "PERSON"}, {"entity": "August 4, 1961", "type": "DATE"}, {"entity": "Honolulu", "type": "LOCATION"}, {"entity": "Hawaii", "type": "LOCATION"}], "language": "en", "language_score": 0.9943996071815491, "lang": "en"}} 2 | {"text": "Google LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.", "modalities": [], "metadata": {"xNER": [{"entity": "Google LLC", "type": "ORGANIZATION"}, {"entity": "Mountain View", "type": "LOCATION"}, {"entity": "California", "type": "LOCATION"}, {"entity": "Larry Page", "type": "PERSON"}, {"entity": "Sergey Brin", "type": "PERSON"}], "language": "en", "language_score": 0.9700106978416443, "lang": "en"}} 3 | {"text": "The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.", "modalities": [], "metadata": {"xNER": [{"entity": "Eiffel Tower", "type": "LOCATION"}, {"entity": "Paris", "type": "LOCATION"}, {"entity": "France", "type": "LOCATION"}], "language": "en", "language_score": 0.9781987071037292, "lang": "en"}} 4 | {"text": "The conference on artificial intelligence will be held in Tokyo on March 15, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "artificial intelligence", "type": "EVENT"}, {"entity": "Tokyo", "type": "LOCATION"}, {"entity": "March 15, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9582112431526184, "lang": "en"}} 5 | {"text": "Amazon shipped the package from its warehouse in Seattle to a customer in New York.", "modalities": [], "metadata": {"xNER": [{"entity": "Amazon", "type": "ORGANIZATION"}, {"entity": "Seattle", "type": "LOCATION"}, {"entity": "New York", "type": "LOCATION"}], "language": "en", "language_score": 0.953260600566864, "lang": "en"}} 6 | {"text": "Elon Musk announced that SpaceX will launch its next rocket from Cape Canaveral on December 12, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "Elon Musk", "type": "PERSON"}, {"entity": "SpaceX", "type": "ORGANIZATION"}, {"entity": "Cape Canaveral", "type": "LOCATION"}, {"entity": "December 12, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9796376824378967, "lang": "en"}} 7 | -------------------------------------------------------------------------------- /examples/sample_data/example_outputs_ner/pp/final_pp.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii.", "modalities": [], "metadata": {"xNER": [{"entity": "Barack Obama", "type": "PERSON"}, {"entity": "August 4, 1961", "type": "DATE"}, {"entity": "Honolulu", "type": "LOCATION"}, {"entity": "Hawaii", "type": "LOCATION"}], "language": "en", "language_score": 0.9943996071815491, "lang": "en"}} 2 | {"text": "Google LLC, headquartered in Mountain View, California, was founded by Larry Page and Sergey Brin.", "modalities": [], "metadata": {"xNER": [{"entity": "Google LLC", "type": "ORGANIZATION"}, {"entity": "Mountain View", "type": "LOCATION"}, {"entity": "California", "type": 
"LOCATION"}, {"entity": "Larry Page", "type": "PERSON"}, {"entity": "Sergey Brin", "type": "PERSON"}], "language": "en", "language_score": 0.9700106978416443, "lang": "en"}} 3 | {"text": "The Eiffel Tower, located in Paris, France, attracts millions of tourists every year.", "modalities": [], "metadata": {"xNER": [{"entity": "Eiffel Tower", "type": "LOCATION"}, {"entity": "Paris", "type": "LOCATION"}, {"entity": "France", "type": "LOCATION"}], "language": "en", "language_score": 0.9781987071037292, "lang": "en"}} 4 | {"text": "The conference on artificial intelligence will be held in Tokyo on March 15, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "artificial intelligence", "type": "EVENT"}, {"entity": "Tokyo", "type": "LOCATION"}, {"entity": "March 15, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9582112431526184, "lang": "en"}} 5 | {"text": "Amazon shipped the package from its warehouse in Seattle to a customer in New York.", "modalities": [], "metadata": {"xNER": [{"entity": "Amazon", "type": "ORGANIZATION"}, {"entity": "Seattle", "type": "LOCATION"}, {"entity": "New York", "type": "LOCATION"}], "language": "en", "language_score": 0.953260600566864, "lang": "en"}} 6 | {"text": "Elon Musk announced that SpaceX will launch its next rocket from Cape Canaveral on December 12, 2024.", "modalities": [], "metadata": {"xNER": [{"entity": "Elon Musk", "type": "PERSON"}, {"entity": "SpaceX", "type": "ORGANIZATION"}, {"entity": "Cape Canaveral", "type": "LOCATION"}, {"entity": "December 12, 2024", "type": "DATE"}], "language": "en", "language_score": 0.9796376824378967, "lang": "en"}} 7 | -------------------------------------------------------------------------------- /examples/sample_data/html/ex1.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | Test Page 4 | 5 |
Hello, World! 6 | This is a test HTML document.
7 | Test Image 8 | 9 | 10 | -------------------------------------------------------------------------------- /examples/sample_data/images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/sample_data/images/image.png -------------------------------------------------------------------------------- /examples/sample_data/md/test.md: -------------------------------------------------------------------------------- 1 | # Test with images 2 | 3 | ![Image](../images/image.png) 4 | 5 | # Test with remote images 6 | 7 | ![Image](https://www.epfl.ch/about/overview/wp-content/uploads/2020/07/logo-epfl-1152x648.png) 8 | -------------------------------------------------------------------------------- /examples/sample_data/media/sample_audio.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/sample_data/media/sample_audio.mp3 -------------------------------------------------------------------------------- /examples/sample_data/media/video.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/sample_data/media/video.mp4 -------------------------------------------------------------------------------- /examples/sample_data/pdf/Corona Virus Disease-19 pandemic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/sample_data/pdf/Corona Virus Disease-19 pandemic.pdf -------------------------------------------------------------------------------- /examples/sample_data/pdf/calendar.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/sample_data/pdf/calendar.pdf -------------------------------------------------------------------------------- /examples/sample_data/pdf/llava-interleave.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/sample_data/pdf/llava-interleave.pdf -------------------------------------------------------------------------------- /examples/sample_data/pptx/ada.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/sample_data/pptx/ada.pptx -------------------------------------------------------------------------------- /examples/sample_data/spreadsheet/survey.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/examples/sample_data/spreadsheet/survey.xlsx -------------------------------------------------------------------------------- /examples/sample_data/txt/poeme.txt: -------------------------------------------------------------------------------- 1 | Demain, dès l’aube, à l’heure où blanchit la campagne, 2 | Je partirai. Vois-tu, je sais que tu m’attends. 3 | J’irai par la forêt, j’irai par la montagne. 
4 | Je ne puis demeurer loin de toi plus longtemps. 5 | 6 | Je marcherai les yeux fixés sur mes pensées, 7 | Sans rien voir au dehors, sans entendre aucun bruit, 8 | Seul, inconnu, le dos courbé, les mains croisées, 9 | Triste, et le jour pour moi sera comme la nuit. 10 | 11 | Je ne regarderai ni l’or du soir qui tombe, 12 | Ni les voiles au loin descendant vers Harfleur, 13 | Et quand j’arriverai, je mettrai sur ta tombe 14 | Un bouquet de houx vert et de bruyère en fleur. 15 | 16 | -------------------------------------------------------------------------------- /examples/sample_data/txt/test.txt: -------------------------------------------------------------------------------- 1 | The 3 a.m. Puzzle 2 | 3 | At 3 a.m., the screen was bare, 4 | We sat and wondered what to put there. 5 | Ideas came, then slipped from view, 6 | A riddle? A joke? We had no clue. 7 | 8 | So here’s a verse, both true and plain: 9 | At 3 a.m., we puzzled in vain. -------------------------------------------------------------------------------- /examples/websearchRAG/config.yaml: -------------------------------------------------------------------------------- 1 | websearch: 2 | use_rag: true 3 | rag_config_path: examples/rag/config.yaml 4 | use_summary: true 5 | n_subqueries : 2 6 | input_file: examples/rag/output.json 7 | input_queries: examples/rag/queries.jsonl 8 | output_file: examples/websearchRAG/enhanced_results_trial.json 9 | n_loops: 2 10 | max_searches: 5 11 | mode: local 12 | llm_config: 13 | llm_name: OpenMeditron/meditron3-8b 14 | max_new_tokens: 1200 15 | -------------------------------------------------------------------------------- /examples/websearchRAG/config_api.yaml: -------------------------------------------------------------------------------- 1 | websearch: 2 | rag_config_path: examples/rag/config_api.yaml 3 | n_subqueries : 3 4 | n_loops: 2 5 | max_searches: 10 6 | mode: api 7 | llm_config: 8 | llm_name: OpenMeditron/meditron3-8b 9 | max_new_tokens: 1200 10 | 11 | # Mode Config 12 | mode_args: 13 | endpoint: '/rag' 14 | port: 8000 15 | host: 'localhost' 16 | -------------------------------------------------------------------------------- /examples/who/index/index_config_bge.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: BAAI/bge-large-en-v1.5 4 | sparse_model: 5 | model_name: splade 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf2.db 8 | collection_name: who 9 | documents_path: "/mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/pp/2___🧩 Chunker.jsonl" 10 | batch_size: 64 -------------------------------------------------------------------------------- /examples/who/index/index_config_e5.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: intfloat/e5-large-v2 4 | sparse_model: 5 | model_name: splade 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_e5_ivf.db 8 | collection_name: who 9 | documents_path: "/mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/pp/2___🧩 Chunker.jsonl" 10 | batch_size: 64 -------------------------------------------------------------------------------- /examples/who/index/index_config_gte.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 
| dense_model: 3 | model_name: Alibaba-NLP/gte-large-en-v1.5 4 | sparse_model: 5 | model_name: splade 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_gte_ivf.db 8 | collection_name: who 9 | documents_path: "/mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/pp/2___🧩 Chunker.jsonl" 10 | batch_size: 64 -------------------------------------------------------------------------------- /examples/who/index/index_config_medembed.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: abhinand/MedEmbed-large-v0.1 4 | sparse_model: 5 | model_name: splade 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_medembed_ivf.db 8 | collection_name: who 9 | documents_path: "/mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/pp/2___🧩 Chunker.jsonl" 10 | batch_size: 64 -------------------------------------------------------------------------------- /examples/who/index/index_config_stella.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: dunzhang/stella_en_400M_v5 4 | sparse_model: 5 | model_name: splade 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_stella_ivf.db 8 | collection_name: who 9 | documents_path: "/mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/pp/2___🧩 Chunker.jsonl" 10 | batch_size: 64 -------------------------------------------------------------------------------- /examples/who/pp/chunk_semantic.yaml: -------------------------------------------------------------------------------- 1 | data_path: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/merged/merged_results.jsonl 2 | pipeline: 3 | pp_modules: 4 | - type: chunker 5 | args: 6 | chunking_strategy: semantic 7 | - type: lang_detector 8 | output: 9 | output_path: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk-semantic 10 | save_each_step: True -------------------------------------------------------------------------------- /examples/who/pp/chunk_sentence.yaml: -------------------------------------------------------------------------------- 1 | data_path: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/merged/merged_results.jsonl 2 | pipeline: 3 | pp_modules: 4 | - type: chunker 5 | args: 6 | chunking_strategy: sentence 7 | - type: lang_detector 8 | output: 9 | output_path: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk-sentence 10 | save_each_step: True -------------------------------------------------------------------------------- /examples/who/pp/pp_config.yaml: -------------------------------------------------------------------------------- 1 | data_path: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/merged/merged_results.jsonl 2 | pipeline: 3 | pp_modules: 4 | - type: datatrove_filter 5 | args: 6 | datatrove_name: language 7 | exclusion_writer: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/logs/language.jsonl 8 | datatrove_args: 9 | languages: ["en"] 10 | - type: chunker 11 | args: 12 | chunking_strategy: sentence 13 | - type: 
datatrove_filter 14 | args: 15 | datatrove_name: gopher-repetition 16 | exclusion_writer: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/logs/fineweb.jsonl 17 | - type: datatrove_filter 18 | args: 19 | datatrove_name: gopher-quality 20 | exclusion_writer: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/logs/fineweb.jsonl 21 | - type: datatrove_filter 22 | args: 23 | datatrove_name: c4 24 | exclusion_writer: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/logs/fineweb.jsonl 25 | - type: datatrove_filter 26 | args: 27 | datatrove_name: fineweb 28 | exclusion_writer: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/logs/fineweb.jsonl 29 | output: 30 | output_path: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/pp 31 | save_each_step: True -------------------------------------------------------------------------------- /examples/who/scott_ds/k1/scott_gpt4o_bge.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: gpt-4o 4 | retriever: 5 | db: 6 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf.db 7 | hybrid_search_weight: 0.5 8 | k: 1 9 | system_prompt: "You will have to answer a question. You can use or not the following context to answer the question.\n\nContext:\n{context}" 10 | mode: local 11 | mode_args: 12 | input_file: examples/who/scott_ds_queries.jsonl 13 | output_file: examples/who/scott_ds/k1/scott_ds_gpt4o_bge.jsonl -------------------------------------------------------------------------------- /examples/who/scott_ds/k1/scott_meditron70b_bge.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-70B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf2.db 8 | hybrid_search_weight: 0.5 9 | k: 1 10 | system_prompt: "You will have to answer a question. You can use or not the following context to answer the question.\n\nContext:\n{context}" 11 | mode: local 12 | mode_args: 13 | input_file: examples/who/scott_ds_queries.jsonl 14 | output_file: examples/who/scott_ds/k1/scott_ds_meditron70b_bge.jsonl -------------------------------------------------------------------------------- /examples/who/scott_ds/k1/scott_meditron8b_bge.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-8B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf2.db 8 | hybrid_search_weight: 0.5 9 | k: 1 10 | system_prompt: "You will have to answer a question. 
You can use or not the following context to answer the question.\n\nContext:\n{context}" 11 | mode: local 12 | mode_args: 13 | input_file: examples/who/scott_ds_queries.jsonl 14 | output_file: examples/who/scott_ds/k1/scott_ds_meditron8b_bge.jsonl -------------------------------------------------------------------------------- /examples/who/scott_ds/k3/scott_gpt4o_bge.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: gpt-4o 4 | retriever: 5 | db: 6 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf.db 7 | hybrid_search_weight: 0.5 8 | k: 3 9 | system_prompt: "You will have to answer a question. You can use or not the following context to answer the question.\n\nContext:\n{context}" 10 | mode: local 11 | mode_args: 12 | input_file: examples/who/scott_ds_queries.jsonl 13 | output_file: examples/who/scott_ds/k3/scott_ds_gpt4o_bge.jsonl -------------------------------------------------------------------------------- /examples/who/scott_ds/k3/scott_meditron70b_bge.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-70B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf2.db 8 | hybrid_search_weight: 0.5 9 | k: 3 10 | system_prompt: "You will have to answer a question. You can use or not the following context to answer the question.\n\nContext:\n{context}" 11 | mode: local 12 | mode_args: 13 | input_file: examples/who/scott_ds_queries.jsonl 14 | output_file: examples/who/scott_ds/k3/scott_ds_meditron70b_bge.jsonl -------------------------------------------------------------------------------- /examples/who/scott_ds/k3/scott_meditron8b_bge.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-8B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf.db 8 | hybrid_search_weight: 0.5 9 | k: 3 10 | system_prompt: "You will have to answer a question. You can use or not the following context to answer the question.\n\nContext:\n{context}" 11 | mode: local 12 | mode_args: 13 | input_file: examples/who/scott_ds_queries.jsonl 14 | output_file: examples/who/scott_ds/k3/scott_ds_meditron8b_bge.jsonl -------------------------------------------------------------------------------- /examples/who/scott_ds/norag/scott_ds_gpt4o_norag_100tokens.jsonl: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "answer": "In a patient with severe malaria and acute kidney injury, the dosing of quinine needs to be carefully adjusted to prevent accumulation and toxicity, while still ensuring effective treatment of the malaria. Quinine has renal clearance, and in the context of renal impairment, its elimination is reduced, necessitating dosage adjustment.\n\nHere is a general guideline for dosing quinine in patients with renal impairment:\n\n1. **Loading Dose**: Start with a standard loading dose of quinine, which is typically 20 mg/kg given", 4 | "input": "I am treating a patient with severe malaria who has an acute kidney injury due to their disease severity. 
We do not have artesunate in stock and we cant do dialysis here, we only have quinine in a parenteral solution. How should I adjust the dosage for Quinine in this patient?", 5 | "context": "" 6 | }, 7 | { 8 | "answer": "In the case of co-infection with tuberculosis (TB) and HIV in children, it's important to manage both conditions carefully to optimize outcomes and minimize complications. For an 8-year-old patient with TB and newly diagnosed HIV, the timing of initiating antiretroviral therapy (ART) should be considered carefully.\n\nCurrent guidelines generally recommend starting ART as soon as possible in all children with HIV, regardless of TB co-infection. However, in the specific scenario of TB meningitis, there is a need", 9 | "input": "My 8yr old female patient was recently diagnosed with TB and TB meningitis. She has improved since starting her TB treatment 3 weeks ago, but she was also diagnosed with HIV, can I start her ART now?", 10 | "context": "" 11 | }, 12 | { 13 | "answer": "In the scenario you described, where a neonate is not breathing within one minute after birth and has a heart rate of 30-40 beats per minute, both positive pressure ventilation (PPV) and chest compressions are indicated. According to neonatal resuscitation guidelines, if a neonate's heart rate is below 60 beats per minute after 30 seconds of effective positive pressure ventilation, chest compressions should be started in addition to continuing ventilation. In this situation, the heart rate is significantly", 14 | "input": "When two providers are resuscitating a neonate who isn't breathing within one minute after birth and the heartrate is 30-40 beats per minute should we assist ventilation and do chest compressions or just ventilate?", 15 | "context": "" 16 | } 17 | ] -------------------------------------------------------------------------------- /examples/who/scott_ds/norag/scott_ds_meditron70b_norag_100tokens.jsonl: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "I am treating a patient with severe malaria who has an acute kidney injury due to their disease severity. We do not have artesunate in stock and we cant do dialysis here, we only have quinine in a parenteral solution. How should I adjust the dosage for Quinine in this patient?", 4 | "answer": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou will have to answer a question.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI am treating a patient with severe malaria who has an acute kidney injury due to their disease severity. We do not have artesunate in stock and we cant do dialysis here, we only have quinine in a parenteral solution. How should I adjust the dosage for Quinine in this patient?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nQuinine should be used with caution in patients with acute kidney injury as it is excreted by the kidneys. The dosage should be adjusted according to the patient's renal function. The recommended dose for quinine in patients with severe malaria is 10 mg/kg of body weight every 8 hours, but this dose may need to be adjusted in patients with acute kidney injury. It is recommended to consult a specialist or a drug reference guide for specific dosage adjustments in this patient population.", 5 | "context": "" 6 | }, 7 | { 8 | "input": "My 8yr old female patient was recently diagnosed with TB and TB meningitis. 
She has improved since starting her TB treatment 3 weeks ago, but she was also diagnosed with HIV, can I start her ART now?", 9 | "answer": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou will have to answer a question.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nMy 8yr old female patient was recently diagnosed with TB and TB meningitis. She has improved since starting her TB treatment 3 weeks ago, but she was also diagnosed with HIV, can I start her ART now?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n## Initiating Antiretroviral Therapy (ART) in a Child with TB Meningitis and HIV\n\n### Background:\nIn patients with TB meningitis and HIV, the timing of ART initiation is crucial to prevent further complications and ensure optimal outcomes.\n\n### Factors to Consider:\n- **TB Meningitis Severity**: The patient's condition must be stabilized before starting ART.\n- **TB Treatment Response**: The patient should show improvement on TB treatment before initiating ART.\n- **HIV Treatment Guidelines**:", 10 | "context": "" 11 | }, 12 | { 13 | "input": "When two providers are resuscitating a neonate who isn't breathing within one minute after birth and the heartrate is 30-40 beats per minute should we assist ventilation and do chest compressions or just ventilate?", 14 | "answer": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou will have to answer a question.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhen two providers are resuscitating a neonate who isn't breathing within one minute after birth and the heartrate is 30-40 beats per minute should we assist ventilation and do chest compressions or just ventilate?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n## Neonatal Resuscitation: Ventilation and Chest Compressions\n\n### Indications for Chest Compressions\n\nChest compressions are indicated in neonatal resuscitation when the heart rate remains below 60 beats per minute despite:\n\n1. **Effective Ventilation**: Proper ventilation has been provided for at least 30 seconds.\n2. **No Improvement in Heart Rate**: The heart rate has not increased above 60 beats per minute.\n\n### Initial Steps\n\n1. **Ventilation**: Assist ventilation", 15 | "context": "" 16 | } 17 | ] -------------------------------------------------------------------------------- /examples/who/scott_ds/norag/scott_ds_meditron8b_norag_100tokens.jsonl: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "context": "", 4 | "answer": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou will have to answer a question.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI am treating a patient with severe malaria who has an acute kidney injury due to their disease severity. We do not have artesunate in stock and we cant do dialysis here, we only have quinine in a parenteral solution. How should I adjust the dosage for Quinine in this patient?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n## Adjusting Quinine Dosage in Patients with Acute Kidney Injury\n\n### Background:\nQuinine is a medication used to treat severe malaria. However, patients with acute kidney injury (AKI) may require dose adjustments to ensure optimal efficacy and minimize toxicity.\n\n### Factors to Consider:\n\n* **Kidney Function:** Quinine is primarily excreted by the kidneys. 
In patients with AKI, the kidneys' ability to clear the drug is impaired, leading to potential accumulation and toxicity.\n", 5 | "input": "I am treating a patient with severe malaria who has an acute kidney injury due to their disease severity. We do not have artesunate in stock and we cant do dialysis here, we only have quinine in a parenteral solution. How should I adjust the dosage for Quinine in this patient?" 6 | }, 7 | { 8 | "context": "", 9 | "answer": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou will have to answer a question.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nMy 8yr old female patient was recently diagnosed with TB and TB meningitis. She has improved since starting her TB treatment 3 weeks ago, but she was also diagnosed with HIV, can I start her ART now?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nYes, it is recommended to start antiretroviral therapy (ART) as soon as possible after the diagnosis of HIV, even if the patient has been diagnosed with TB. This is because starting ART early can help to reduce the risk of HIV transmission to others, and it can also help to improve the patient's immune system and overall health. It is important to work closely with a healthcare provider to ensure that the patient is receiving the best possible care and to monitor for any potential side effects of", 10 | "input": "My 8yr old female patient was recently diagnosed with TB and TB meningitis. She has improved since starting her TB treatment 3 weeks ago, but she was also diagnosed with HIV, can I start her ART now?" 11 | }, 12 | { 13 | "context": "", 14 | "answer": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou will have to answer a question.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhen two providers are resuscitating a neonate who isn't breathing within one minute after birth and the heartrate is 30-40 beats per minute should we assist ventilation and do chest compressions or just ventilate?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n## Neonatal Resuscitation Protocol\n\n### Overview\nNeonatal resuscitation is a critical process that involves providing life-saving interventions to newborns who are not breathing or have a weak pulse. The American Academy of Pediatrics (AAP) and the American Heart Association (AHA) have established a step-by-step approach for neonatal resuscitation.\n\n### Initial Steps\nThe initial steps in neonatal resuscitation include:\n\n1. **Clearing the Airway**\n - Remove any me", 15 | "input": "When two providers are resuscitating a neonate who isn't breathing within one minute after birth and the heartrate is 30-40 beats per minute should we assist ventilation and do chest compressions or just ventilate?" 
16 | } 17 | ] -------------------------------------------------------------------------------- /examples/who/scott_ds/norag/scott_gpt4o_norag.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: gpt-4o 4 | retriever: 5 | db: 6 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_770/who_guidelines_bge_sample.db 7 | k: 0 8 | system_prompt: "You will have to answer a question.{context}" 9 | mode: local 10 | mode_args: 11 | input_file: examples/who/scott_ds_queries.jsonl 12 | output_file: examples/who/scott_ds/norag/scott_ds_gpt4o_norag.jsonl -------------------------------------------------------------------------------- /examples/who/scott_ds/norag/scott_meditron70b_norag.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-70B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_770/who_guidelines_bge_sample_3.db 8 | k: 0 9 | system_prompt: "You will have to answer a question.{context}" 10 | mode: local 11 | mode_args: 12 | input_file: examples/who/scott_ds_queries.jsonl 13 | output_file: examples/who/scott_ds/norag/scott_ds_meditron70b_norag.jsonl -------------------------------------------------------------------------------- /examples/who/scott_ds/norag/scott_meditron8b_norag.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-8B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_770/who_guidelines_bge_sample_2.db 8 | k: 0 9 | system_prompt: "You will have to answer a question.{context}" 10 | mode: local 11 | mode_args: 12 | input_file: examples/who/scott_ds_queries.jsonl 13 | output_file: examples/who/scott_ds/norag/scott_ds_meditron8b_norag.jsonl -------------------------------------------------------------------------------- /examples/who/scott_ds_queries.jsonl: -------------------------------------------------------------------------------- 1 | {"input": "I am treating a patient with severe malaria who has an acute kidney injury due to their disease severity. We do not have artesunate in stock and we cant do dialysis here, we only have quinine in a parenteral solution. How should I adjust the dosage for Quinine in this patient?", "collection_name": "who"} 2 | {"input": "My 8yr old female patient was recently diagnosed with TB and TB meningitis. She has improved since starting her TB treatment 3 weeks ago, but she was also diagnosed with HIV, can I start her ART now?", "collection_name": "who"} 3 | {"input": "When two providers are resuscitating a neonate who isn't breathing within one minute after birth and the heartrate is 30-40 beats per minute should we assist ventilation and do chest compressions or just ventilate?", "collection_name": "who"} 4 | {"input": "I have a pregnant patient at risk of pre-eclampsia and my colleague suggested aspirin prevention therapy, is this correct? 
What is the correct dose and is there a risk of PPH?", "collection_name": "who"} -------------------------------------------------------------------------------- /examples/who/scott_gpt4o_bge.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: gpt-4o 4 | retriever: 5 | db: 6 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_medembed.db 7 | hybrid_search_weight: 0.5 8 | k: 1 9 | system_prompt: "You will have to answer a question. You can use or not the following context to answer the question.\n\nContext:\n{context}" 10 | mode: local 11 | mode_args: 12 | input_file: examples/who/scott_ds_queries.jsonl 13 | output_file: examples/who/scott_ds_openai_norag.jsonl -------------------------------------------------------------------------------- /examples/who/scott_meditron70b_bge.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-70B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf2.db 8 | hybrid_search_weight: 0.5 9 | k: 3 10 | system_prompt: "You will have to answer a question. You can use or not the following context to answer the question.\n\nContext:\n{context}" 11 | mode: local 12 | mode_args: 13 | input_file: examples/who/scottcorr_ds_queries.jsonl 14 | output_file: examples/who/scottcorr_ds_meditron70b_bge.jsonl -------------------------------------------------------------------------------- /examples/who/scott_meditron70b_bge_1.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-70B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf2.db 8 | hybrid_search_weight: 0.5 9 | k: 1 10 | system_prompt: "You will have to answer a question. You can use or not the following context to answer the question.\n\nContext:\n{context}" 11 | mode: local 12 | mode_args: 13 | input_file: examples/who/scott_ds_queries.jsonl 14 | output_file: examples/who/scott_ds_meditron70b_bge.jsonl -------------------------------------------------------------------------------- /examples/who/scott_meditron70b_medembed.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-70B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_medembed_ivf.db 8 | hybrid_search_weight: 0.5 9 | k: 3 10 | system_prompt: "You will have to answer a question. 
You can use or not the following context to answer the question.\n\nContext:\n{context}" 11 | mode: local 12 | mode_args: 13 | input_file: examples/who/scott_ds_queries.jsonl 14 | output_file: examples/who/scott_ds_meditron70b_medembed.jsonl -------------------------------------------------------------------------------- /examples/who/scott_meditron8b_bge.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/Meditron3-8B 4 | max_new_tokens: 8000 5 | retriever: 6 | db: 7 | uri: /mnt/mlo/scratch/homes/mmore/datasets/who_guidelines/processed/extraction_results_47k/chunk_guidelines_bge_ivf.db 8 | hybrid_search_weight: 0.5 9 | k: 3 10 | system_prompt: "You will have to answer a question. You can use or not the following context to answer the question.\n\nContext:\n{context}" 11 | mode: local 12 | mode_args: 13 | input_file: examples/who/scottcorr_ds_queries.jsonl 14 | output_file: examples/who/scottcorr_ds_meditron8b_bge.jsonl -------------------------------------------------------------------------------- /examples/who/scottcorr_ds_queries.jsonl: -------------------------------------------------------------------------------- 1 | {"input": "I am treating a patient with severe malaria who has an acute kidney injury due to their disease severity. We do not have artesunate in stock and we cant do dialysis here, we only have quinine in a parenteral solution. How should I adjust the dosage for Quinine in this patient?", "collection_name": "who"} 2 | {"input": "My 8 years old children female patient was recently diagnosed with TB and TB meningitis. She has improved since starting her TB treatment 3 weeks ago, but she was also diagnosed with HIV, can I start her ART now?", "collection_name": "who"} 3 | {"input": "When two providers are resuscitating a neonate who isn't breathing within one minute after birth and the heartrate is 30-40 beats per minute should we assist ventilation and do chest compressions or just ventilate?", "collection_name": "who"} -------------------------------------------------------------------------------- /mmore_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/mmore_logo.jpg -------------------------------------------------------------------------------- /production-config/index/config.yaml: -------------------------------------------------------------------------------- 1 | indexer: 2 | dense_model: 3 | model_name: sentence-transformers/all-MiniLM-L6-v2 4 | is_multimodal: false 5 | sparse_model: 6 | model_name: splade 7 | is_multimodal: false 8 | db: 9 | uri: $ROOT_OUT_DIR/db/proc_demo.db 10 | name: my_db 11 | collection_name: my_docs 12 | documents_path: '$ROOT_OUT_DIR/postprocessor/outputs/merged/final_pp.jsonl' 13 | -------------------------------------------------------------------------------- /production-config/postprocessor/config.yaml: -------------------------------------------------------------------------------- 1 | pp_modules: 2 | - type: file_namer 3 | - type: chunker 4 | args: 5 | chunking_strategy: sentence 6 | - type: translator 7 | args: 8 | target_language: en 9 | attachment_tag: 10 | confidence_threshold: 0.7 11 | constrained_languages: 12 | - fr 13 | - en 14 | - type: metafuse 15 | args: 16 | metadata_keys: 17 | - file_name 18 | content_template: Content from {file_name} 19 | position: beginning 20 | 21 | output: 22 | output_path: 
$ROOT_OUT_DIR/postprocessor/outputs/merged/ 23 | save_each_step: True 24 | -------------------------------------------------------------------------------- /production-config/process/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: $ROOT_IN_DIR/sample_data/ #put absolute path! 2 | dispatcher_config: 3 | output_path: $ROOT_OUT_DIR/process/outputs/ #put absolute path! 4 | use_fast_processors: false 5 | distributed: false 6 | dashboard_backend_url: null 7 | extract_images: true 8 | scheduler_file: $ROOT_OUT_DIR/scheduler-file.json #put absolute path! 9 | process_batch_sizes: 10 | - URLProcessor: 40 11 | - DOCXProcessor: 100 12 | - PDFProcessor: 4000 13 | - MediaProcessor: 40 14 | - SpreadsheetProcessor: 100 15 | - TXTProcessor: 100 16 | - PPTXProcessor: 100 17 | - MarkdownProcessor: 100 18 | - EMLProcessor: 100 19 | - HTMLProcessor: 100 20 | processor_config: 21 | MediaProcessor: 22 | - normal_model: "openai/whisper-large-v3-turbo" 23 | - fast_model: "openai/whisper-tiny" 24 | - type: "automatic-speech-recognition" 25 | - sample_rate: 10 26 | - batch_size: 4 27 | 28 | PDFProcessor: 29 | - PDFTEXT_CPU_WORKERS: 0 30 | - DETECTOR_BATCH_SIZE: 1 31 | - DETECTOR_POSTPROCESSING_CPU_WORKERS: 0 32 | - RECOGNITION_BATCH_SIZE: 1 33 | - OCR_PARALLEL_WORKERS: 0 34 | - TEXIFY_BATCH_SIZE: 1 35 | - LAYOUT_BATCH_SIZE: 1 36 | - ORDER_BATCH_SIZE: 1 37 | - TABLE_REC_BATCH_SIZE: 1 38 | -------------------------------------------------------------------------------- /production-config/rag/config.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | llm: 3 | llm_name: OpenMeditron/meditron3-8b 4 | max_new_tokens: 1200 5 | retriever: 6 | db: 7 | uri: $ROOT_OUT_DIR/db/proc_demo.db 8 | name: 'my_db' 9 | hybrid_search_weight: 0.5 10 | k: 5 11 | system_prompt: "Use the following context to answer the questions.\n\nContext:\n{context}" 12 | mode: local 13 | mode_args: 14 | input_file: $ROOT_IN_DIR/rag/queries.jsonl 15 | output_file: $ROOT_OUT_DIR/rag/output.json 16 | -------------------------------------------------------------------------------- /production-config/rag/config_api.yaml: -------------------------------------------------------------------------------- 1 | # RAG Config 2 | rag: 3 | # LLM Config 4 | llm: 5 | llm_name: Qwen/Qwen3-8B # "epfl-llm/meditron-70b" # "gpt-4o-mini" # Anything supported 6 | max_new_tokens: 1200 7 | temperature: 0.8 8 | # Retriever Config 9 | retriever: 10 | db: 11 | uri: $ROOT_OUT_DIR/db/proc_demo.db 12 | name: my_db 13 | hybrid_search_weight: 0.5 14 | k: 5 15 | # Prompt Args 16 | system_prompt: "Answer the question using the context.\n\nContext: {context}" 17 | # Mode Config 18 | mode: api 19 | mode_args: 20 | endpoint: '/rag' 21 | port: 8000 22 | host: 'localhost' 23 | -------------------------------------------------------------------------------- /production-config/rag/queries.jsonl: -------------------------------------------------------------------------------- 1 | {"input": "When was Barack Obama born?", "collection_name": "my_docs"} 2 | {"input": "Who founded Google?", "collection_name": "my_docs"} 3 | {"input": "Where is the Eiffel Tower located?", "collection_name": "my_docs"} 4 | {"input": "When will the artificial intelligence conference be held?", "collection_name": "my_docs"} 5 | -------------------------------------------------------------------------------- /production-config/retriever_api/config.yaml: 
-------------------------------------------------------------------------------- 1 | db: 2 | uri: $ROOT_OUT_DIR/db/proc_demo.db 3 | name: my_db 4 | hybrid_search_weight: 0.5 5 | k: 5 6 | collection_name: my_docs 7 | -------------------------------------------------------------------------------- /scripts/data_extractor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import zipfile 4 | 5 | import requests 6 | 7 | url = "https://corp.digitalcorpora.org/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/0000.zip" 8 | response = requests.get(url) 9 | response.raise_for_status()  # fail early on a bad download 10 | with open("0000.zip", "wb") as f: 11 | f.write(response.content) 12 | 13 | # Unzip the file 14 | with zipfile.ZipFile("0000.zip", "r") as zip_ref: 15 | zip_ref.extractall("0000") 16 | 17 | print(os.listdir("0000")) 18 | 19 | # Create a folder to store the 100 files 20 | os.mkdir("0000_small") 21 | 22 | # Copy the first 100 extracted files into the '0000_small' folder 23 | for name in sorted(os.listdir("0000"))[:100]: 24 | shutil.copy(os.path.join("0000", name), "0000_small") 25 | -------------------------------------------------------------------------------- /scripts/process_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default values 4 | CONFIG_PATH="" 5 | RANK="" 6 | 7 | # Helper function to show usage 8 | usage() { 9 | echo "Usage: $0 --config-file <config-file> --rank <rank>" 10 | echo "" 11 | echo "Required arguments:" 12 | echo " --config-file Absolute path to the config.yaml file." 13 | echo " --rank Node rank." 14 | exit 1 15 | } 16 | 17 | # Parse command-line arguments 18 | while [[ $# -gt 0 ]]; do 19 | case $1 in 20 | --config-file) 21 | CONFIG_PATH="$2" 22 | shift 2 23 | ;; 24 | --rank) 25 | RANK="$2" 26 | shift 2 27 | ;; 28 | *) 29 | echo "Unknown argument: $1" 30 | usage 31 | ;; 32 | esac 33 | done 34 | 35 | 36 | # Check required arguments 37 | if [[ -z "$CONFIG_PATH" || -z "$RANK" ]]; then 38 | echo "Error: Missing required arguments." 39 | usage 40 | fi 41 | 42 | # Update and install dependencies 43 | echo "Updating system and installing dependencies..." 44 | 45 | # Extract the distributed configuration from the YAML file 46 | distributed=$(grep -A3 'dispatcher_config:' "$CONFIG_PATH" | grep 'distributed:' | awk '{print $2}') 47 | scheduler_file=$(grep 'scheduler_file:' "$CONFIG_PATH" | awk '{print $2}') 48 | 49 | 50 | # Configure environment variables 51 | echo "Setting up environment variables" 52 | export DASK_DISTRIBUTED__WORKER__DAEMON=False 53 | 54 | # Dask part of the script 55 | 56 | if [ "$distributed" = "true" ]; then 57 | echo "Distributed mode enabled" 58 | # Start the Dask scheduler if the current node is the MASTER (rank 0) 59 | if [ "$RANK" -eq 0 ]; then 60 | echo "Starting the scheduler because it is the MASTER node (rank 0)" 61 | dask scheduler --scheduler-file "$scheduler_file" &> dask_scheduler.log & 62 | SCHEDULER_PID=$! 63 | fi 64 | 65 | # Start the Dask worker 66 | echo "Starting the worker on every node" 67 | dask worker --scheduler-file "$scheduler_file" &> "dask_scheduler_worker_$RANK.log" & 68 | fi 69 | 70 | 71 | # Run the end-to-end test if the current node is the MASTER (rank 0) 72 | if [ "$RANK" -eq 0 ]; then 73 | echo "Running the end-to-end test on the MASTER node (rank 0)" 74 | echo "Command to execute: python -m mmore process --config-file \"$CONFIG_PATH\"" 75 | echo "Should maybe exit here and wait until all the workers are ready!"
76 | echo "Type 'go' to execute the command, or type 'exit' to stop and run it manually later." 77 | 78 | # waiting for the user to type 'go' or 'exit' 79 | while true; do 80 | read -r user_input 81 | if [ "$user_input" = "go" ]; then 82 | echo "Starting processing" 83 | python -m mmore process --config-file "$CONFIG_PATH" 84 | break 85 | elif [ "$user_input" = "exit" ]; then 86 | echo "Exiting without running the command. You can run it manually later:" 87 | echo "python -m mmore process --config-file \"$CONFIG_PATH\"" 88 | exit 0 89 | else 90 | echo "Invalid input. Type 'go' to run the command or 'exit' to stop." 91 | fi 92 | done 93 | 94 | kill -9 $SCHEDULER_PID 95 | fi -------------------------------------------------------------------------------- /scripts/runai/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT="process" 4 | while getopts e:s:c:p: flag 5 | do 6 | case "${flag}" in 7 | s) SCRIPT="${OPTARG}";; 8 | c) CONFIG="${OPTARG}";; 9 | p) REPO_PATH="${OPTARG}";; 10 | esac 11 | done 12 | 13 | # Going to repo dir 14 | if [ -z "$REPO_PATH" ]; then 15 | REPO_PATH="/mmore" # change to the actual repo path 16 | fi 17 | cd $REPO_PATH 18 | 19 | # Loading env vars 20 | set -o allexport 21 | source .env 22 | set +o allexport 23 | 24 | pip install -e . 25 | 26 | echo "Start time: $(date)" 27 | 28 | # Launch script 29 | echo "Running: python -m mmore $SCRIPT --config-file $CONFIG" 30 | python -m mmore $SCRIPT --config-file $CONFIG 31 | 32 | echo "End time: $(date)" -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit immediately if a command exits with a non-zero status 4 | set -e 5 | 6 | # Update package lists 7 | sudo apt update 8 | 9 | # Install system dependencies 10 | sudo apt install -y ffmpeg libsm6 libxext6 chromium-browser libnss3 libgconf-2-4 libxi6 libxrandr2 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxrender1 libasound2 libatk1.0-0 libgtk-3-0 libreoffice 11 | 12 | # Install UV 13 | curl -LsSf https://astral.sh/uv/install.sh | sh 14 | echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrcuv sync 15 | uv venv 16 | source .venv/bin/activate -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/src/__init__.py -------------------------------------------------------------------------------- /src/mmore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/src/mmore/__init__.py -------------------------------------------------------------------------------- /src/mmore/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /src/mmore/dashboard/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/src/mmore/dashboard/__init__.py 
-------------------------------------------------------------------------------- /src/mmore/dashboard/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/src/mmore/dashboard/backend/__init__.py -------------------------------------------------------------------------------- /src/mmore/dashboard/backend/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/src/mmore/dashboard/backend/benchmarks/__init__.py -------------------------------------------------------------------------------- /src/mmore/dashboard/backend/client.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import requests 4 | 5 | 6 | class DashboardClient: 7 | """ 8 | Client to interact with the dashboard backend from the workers' side 9 | """ 10 | 11 | def __init__(self, url: Optional[str]): 12 | """ 13 | url: the url of the dashboard backend 14 | """ 15 | self.url = url 16 | 17 | def init_db(self, total_files: int): 18 | """ 19 | initialize the dashboard 20 | :param total_files: the total number of files to process 21 | """ 22 | if self.url is None: 23 | print("Init db skipped, no url provided") 24 | return 25 | try: 26 | metadata = {"total_files": total_files} 27 | response = requests.post(f"{self.url}/init-db", json=metadata) 28 | response.raise_for_status() 29 | except Exception as e: 30 | return {"error": str(e)} 31 | 32 | def report(self, worker_id, finished_file_paths) -> bool: 33 | """ 34 | Report that a worker finished processing a group of files 35 | :param worker_id: the worker id 36 | :param finished_file_paths: the list of file paths that have been processed 37 | :return: True if the response body is True, False otherwise 38 | """ 39 | if self.url is None: 40 | print("Report request skipped, no url provided") 41 | return False 42 | assert isinstance(finished_file_paths, list) 43 | assert isinstance(worker_id, str) 44 | try: 45 | metadata = { 46 | "worker_id": worker_id, 47 | "finished_file_paths": finished_file_paths, 48 | } 49 | response = requests.post(f"{self.url}/reports", json=metadata) 50 | return response.json() 51 | except Exception as e: 52 | print(e) 53 | return False 54 | 55 | 56 | if __name__ == "__main__": 57 | # Test the client locally 58 | backend_url = "http://localhost:8000" 59 | print(DashboardClient(backend_url).report("42", ["filex"])) 60 | -------------------------------------------------------------------------------- /src/mmore/dashboard/backend/model.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List, Optional 3 | 4 | from pydantic import BaseModel, BeforeValidator, Field 5 | from typing_extensions import Annotated 6 | 7 | # ObjectId will be cast to a string before being validated as a str 8 | PyObjectId = Annotated[str, BeforeValidator(str)] 9 | 10 | 11 | class Report(BaseModel): 12 | """Report model saved in the database.""" 13 | 14 | id: Optional[PyObjectId] = Field(alias="_id", default=None) 15 | worker_id: str = Field(alias="worker_id") 16 | finished_file_paths: List[str] 17 | timestamp: Optional[datetime] = Field(default=None) 18 | 19 | class Config: 20 | populate_by_name = True 21 | extra = "allow" 22 | 23 | 24 | class
DashboardMetadata(BaseModel): 25 | """Progress metadata saved in the database.""" 26 | 27 | total_files: int 28 | start_time: Optional[datetime] = Field(default=None) 29 | ask_to_stop: bool = False 30 | 31 | 32 | ########## 33 | 34 | 35 | class LatestReport(BaseModel): 36 | """Report info sent to the frontend""" 37 | 38 | timestamp: datetime 39 | count: int 40 | # file_paths: List[str] 41 | 42 | 43 | class WorkerLatest(BaseModel): 44 | """Worker info sent to the frontend""" 45 | 46 | worker_id: str 47 | latest_timestamp: datetime 48 | last_active: str 49 | latest_reports: List[LatestReport] 50 | 51 | 52 | class Progress(BaseModel): 53 | total_files: int 54 | start_time: Optional[datetime] 55 | finished_files: int 56 | progress: float 57 | last_activity: str 58 | ask_to_stop: bool 59 | 60 | 61 | class BatchedReports(BaseModel): 62 | """Batch report model sent to the frontend""" 63 | 64 | reports: List[Report] 65 | total_records: int 66 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | pnpm-debug.log* 8 | lerna-debug.log* 9 | 10 | node_modules 11 | dist 12 | dist-ssr 13 | *.local 14 | 15 | # Editor directories and files 16 | .vscode/* 17 | !.vscode/extensions.json 18 | .idea 19 | .DS_Store 20 | *.suo 21 | *.ntvs* 22 | *.njsproj 23 | *.sln 24 | *.sw? 25 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/eslint.config.js: -------------------------------------------------------------------------------- 1 | import js from '@eslint/js' 2 | import globals from 'globals' 3 | import reactHooks from 'eslint-plugin-react-hooks' 4 | import reactRefresh from 'eslint-plugin-react-refresh' 5 | import tseslint from 'typescript-eslint' 6 | 7 | export default tseslint.config( 8 | { ignores: ['dist'] }, 9 | { 10 | extends: [js.configs.recommended, ...tseslint.configs.recommended], 11 | files: ['**/*.{ts,tsx}'], 12 | languageOptions: { 13 | ecmaVersion: 2020, 14 | globals: globals.browser, 15 | }, 16 | plugins: { 17 | 'react-hooks': reactHooks, 18 | 'react-refresh': reactRefresh, 19 | }, 20 | rules: { 21 | ...reactHooks.configs.recommended.rules, 22 | 'react-refresh/only-export-components': [ 23 | 'warn', 24 | { allowConstantExport: true }, 25 | ], 26 | }, 27 | }, 28 | ) 29 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/index.html: -------------------------------------------------------------------------------- 1 | <!doctype html> 2 | <html lang="en"> 3 | <head> 4 | <meta charset="UTF-8" /> 5 | <link rel="icon" type="image/svg+xml" href="/vite.svg" /> 6 | <meta name="viewport" content="width=device-width, initial-scale=1.0" /> 7 | <title>Vite + React + TS</title> 8 | </head> 9 | <body> 10 | <div id="root"></div>
11 | <script type="module" src="/src/main.tsx"></script> 12 | </body> 13 | </html> 14 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mmore-dashboard", 3 | "private": true, 4 | "version": "0.0.0", 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "tsc -b && vite build", 9 | "lint": "eslint .", 10 | "preview": "vite preview" 11 | }, 12 | "dependencies": { 13 | "chartjs-adapter-date-fns": "3.0.0", 14 | "prettier": "3.4.2", 15 | "primereact": "10.9.1", 16 | "react": "19.0.0", 17 | "react-chartjs-2": "5.3.0", 18 | "react-dom": "19.0.0", 19 | "react-force-graph-2d": "1.26.1" 20 | }, 21 | "devDependencies": { 22 | "@eslint/js": "^9.15.0", 23 | "@types/react": "^18.3.12", 24 | "@types/react-dom": "^18.3.1", 25 | "@vitejs/plugin-react": "^4.3.4", 26 | "eslint": "^9.15.0", 27 | "eslint-plugin-react-hooks": "^5.0.0", 28 | "eslint-plugin-react-refresh": "^0.4.14", 29 | "globals": "^15.12.0", 30 | "typescript": "~5.6.2", 31 | "typescript-eslint": "^8.15.0", 32 | "vite": "^6.0.3" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/public/vite.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/App.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: Arial, sans-serif; 3 | margin: 0; 4 | padding: 0; 5 | background-color: #f4f4f9; 6 | display: flex; 7 | justify-content: center; 8 | align-items: flex-start; 9 | min-height: 100vh; 10 | } 11 | 12 | .dashboard { 13 | width: 105%; /* Take the full width of the screen */ 14 | max-width: 1600px; /* Limit the width on larger screens */ 15 | background-color: #ffffff; 16 | border-radius: 8px; 17 | box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); 18 | padding: 20px; 19 | } 20 | 21 | .header { 22 | text-align: center; 23 | background-color: #282c34; 24 | color: white; 25 | padding: 20px 0; 26 | border-radius: 8px; 27 | margin-bottom: 20px; 28 | } 29 | 30 | .system-overview { 31 | display: flex; 32 | justify-content: space-between; 33 | margin-bottom: 20px; 34 | gap: 20px; 35 | } 36 | 37 | .progress, .task-chart { 38 | flex: 1; 39 | padding: 20px; 40 | background: #fff; 41 | border-radius: 8px; 42 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 43 | } 44 | 45 | .progress-bar { 46 | background: #ddd; 47 | border-radius: 4px; 48 | overflow: hidden; 49 | } 50 | 51 | .progress-fill { 52 | background: #4caf50; 53 | height: 8px; 54 | } 55 | 56 | .worker-graph { 57 | margin-bottom: 20px; 58 | background: #fff; 59 | padding: 20px; 60 | border-radius: 8px; 61 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 62 | } 63 | 64 | .workers { 65 | margin-bottom: 20px; 66 | } 67 | 68 | .worker-list { 69 | display: grid; 70 | grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); /* Wider worker cards */ 71 | gap: 20px; 72 | } 73 | 74 | .worker-card { 75 | text-align: center; 76 | background: #fff; 77 | padding: 15px; 78 | border-radius: 8px; 79 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 80 | } 81 | 82 | .logs, .file-explorer { 83 | background: #fff; 84 | padding: 20px; 85 | margin-bottom: 20px; 86 | border-radius: 8px; 87 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 88 | } 89 | 90 | .logs table { 91 | width: 100%; 92 | border-collapse: collapse; 93 | } 94 | 95 | .logs th, .logs
td { 96 | text-align: left; 97 | padding: 8px; 98 | border-bottom: 1px solid #ddd; 99 | } 100 | 101 | .controls { 102 | text-align: center; 103 | } 104 | 105 | button { 106 | padding: 10px 20px; 107 | margin: 10px; 108 | border: none; 109 | border-radius: 4px; 110 | cursor: pointer; 111 | } 112 | 113 | button.stop { 114 | background: #f44336; 115 | color: white; 116 | } 117 | 118 | button.start { 119 | background: #4caf50; 120 | color: white; 121 | } 122 | .file-explorer { 123 | display: flex; 124 | flex-direction: column; 125 | gap: 1rem; 126 | } 127 | 128 | .p-datatable { 129 | margin-top: 1rem; 130 | } 131 | 132 | .p-treetable { 133 | width: 100%; 134 | } 135 | 136 | .p-paginator { 137 | margin-top: 0.5rem; 138 | } 139 | 140 | .progress-circle { 141 | display: flex; 142 | justify-content: center; 143 | align-items: center; 144 | position: relative; 145 | width: 200px; 146 | height: 200px; 147 | border-radius: 50%; 148 | background: conic-gradient(#4caf50 70%, #ddd 0); 149 | margin: 20px auto; 150 | box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); 151 | } 152 | 153 | .circle-inner { 154 | display: flex; 155 | flex-direction: column; 156 | justify-content: center; 157 | align-items: center; 158 | width: 160px; 159 | height: 160px; 160 | border-radius: 50%; 161 | background: #ffffff; 162 | box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.1); 163 | } 164 | 165 | .progress-number { 166 | font-size: 2rem; 167 | font-weight: bold; 168 | color: #4caf50; 169 | margin: 0; 170 | } 171 | 172 | .progress-label { 173 | font-size: 1rem; 174 | color: #666; 175 | } 176 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/App.tsx: -------------------------------------------------------------------------------- 1 | import {useEffect, useState} from "react"; 2 | import "./App.css"; 3 | import {WorkerLatest} from "./WorkerLatest.ts"; 4 | import "chartjs-adapter-date-fns"; 5 | import ReportsTableView from "./ReportsTableView.tsx"; 6 | import GraphAndSublistView from "./GraphAndSublistView.tsx"; 7 | import ReportsChartView from "./ReportsChartView.tsx"; 8 | import OverallProgressView from "./OverallProgressView.tsx"; 9 | 10 | const apiUrl = import.meta.env.VITE_BACKEND_API_URL; 11 | 12 | function App() { 13 | const [workers, setWorkers] = useState<WorkerLatest[]>([]); 14 | 15 | function loadWorkersLatest() { 16 | fetch(apiUrl + "/reports/workers/latest") 17 | .then((res) => res.json()) 18 | .then((data) => { 19 | // eslint-disable-next-line @typescript-eslint/ban-ts-comment 20 | // @ts-expect-error 21 | const workers = data.map((worker) => new WorkerLatest(worker)); 22 | setWorkers(workers); 23 | }) 24 | .catch((error) => { 25 | console.error("Error fetching workers latest:", error); 26 | }); 27 | } 28 | 29 | useEffect(() => { 30 | loadWorkersLatest(); 31 | }, []); 32 | 33 | 34 | return ( 35 | <div className="dashboard">
36 | <header className="header"> 37 | <h1>MMORE Dashboard 🐮🚀️</h1> 38 | </header> 39 | 40 | <OverallProgressView/> 41 | 42 | <GraphAndSublistView workers={workers}/> 43 | 44 | <ReportsChartView workers={workers}/> 45 | 46 | <ReportsTableView/> 47 | 48 | </div>
49 | ); 50 | } 51 | 52 | export default App; 53 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/OverallProgressView.tsx: -------------------------------------------------------------------------------- 1 | import {useEffect, useState} from "react"; 2 | import "./App.css"; 3 | import Progress from "./Progress"; 4 | import "chartjs-adapter-date-fns"; 5 | 6 | const apiUrl = import.meta.env.VITE_BACKEND_API_URL; 7 | 8 | function OverallProgressView() { 9 | const [progress, setProgress] = useState<Progress>(); 10 | 11 | function loadProgress() { 12 | fetch(apiUrl + "/progress") 13 | .then((res) => res.json()) 14 | .then((data) => { 15 | const progress = new Progress(data); 16 | setProgress(progress); 17 | }) 18 | .catch((error) => { 19 | console.error("Error fetching progress:", error); 20 | }); 21 | } 22 | 23 | function stopExecution() { 24 | fetch(apiUrl + "/stop", { 25 | method: "POST", 26 | }) 27 | .then((res) => res.json()) 28 | .then((data) => { 29 | console.log("Stop Execution:", data); 30 | loadProgress(); 31 | }) 32 | .catch((error) => { 33 | alert("Error stopping execution:" + error); 34 | }); 35 | } 36 | 37 | useEffect(() => { 38 | loadProgress(); 39 | }, []); 40 | 41 | return ( 42 | <div className="progress">
43 | <h2>Overall Progress</h2> 44 | 45 | <div className="progress-circle"> 46 | <div className="circle-inner"> 47 | <p className="progress-number">{progress != null ? progress.progress.toFixed(2) : "loading"}%</p> 48 | <p className="progress-label"> 49 | {progress != null 50 | ? progress.askToStop 51 | ? "Stopping..." 52 | : progress.progress === 100 53 | ? "Completed" 54 | : "In Progress" 55 | : "loading"} 56 | </p> 57 | </div> 58 | </div> 59 | 60 | <div> 61 | Total Files Processed:{" "} 62 | {progress != null ? progress.finishedFiles : "..."} 63 | </div> 64 | <div> 65 | Total files to process:{" "} 66 | {progress != null ? progress.totalFiles : "..."} 67 | </div> 68 | <div> 69 | Start time :{" "} 70 | <span> 71 | {progress != null ? progress.startTime.toLocaleString() : "..."} 72 | </span> 73 | </div> 74 | <div> 75 | Latest update :{" "} 76 | {progress != null ? progress.lastActivity : "..."} 77 | </div> 78 | 79 | {progress != null && progress.askToStop ? ( 80 | <p>stopping...</p> 81 | ) : ( 82 | <button className="stop" onClick={stopExecution}> 83 | Stop Execution 84 | </button> 85 | )} 86 | </div>
87 | ); 88 | } 89 | 90 | export default OverallProgressView; 91 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/Progress.ts: -------------------------------------------------------------------------------- 1 | class Progress { 2 | totalFiles: number; 3 | startTime: Date; 4 | finishedFiles: number; 5 | progress: number; 6 | lastActivity: string; 7 | askToStop: boolean; 8 | 9 | // eslint-disable-next-line @typescript-eslint/ban-ts-comment 10 | // @ts-expect-error 11 | constructor(data) { 12 | this.totalFiles = data.total_files; 13 | this.startTime = new Date(data.start_time); 14 | this.finishedFiles = data.finished_files; 15 | this.progress = data.progress; 16 | this.lastActivity = data.last_activity; 17 | this.askToStop = data.ask_to_stop; 18 | } 19 | } 20 | 21 | export default Progress; -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/Report.ts: -------------------------------------------------------------------------------- 1 | class Report { 2 | id: string; 3 | workerId: string; 4 | finishedFilePaths: string[]; 5 | timestamp: Date; 6 | 7 | // eslint-disable-next-line @typescript-eslint/ban-ts-comment 8 | // @ts-expect-error 9 | constructor(data) { 10 | this.id = data._id; 11 | this.workerId = data.worker_id; 12 | this.finishedFilePaths = data.finished_file_paths; 13 | this.timestamp = new Date(data.timestamp); 14 | } 15 | } 16 | 17 | export default Report; -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/ReportsTableView.tsx: -------------------------------------------------------------------------------- 1 | import {useEffect, useState} from "react"; 2 | import {Column} from "primereact/column"; 3 | import {DataTable} from "primereact/datatable"; 4 | import "./App.css"; 5 | import "chartjs-adapter-date-fns"; 6 | import Report from "./Report.ts"; 7 | 8 | const apiUrl = import.meta.env.VITE_BACKEND_API_URL; 9 | 10 | function ReportsTableView() { 11 | const [reports, setReports] = useState<Report[]>([]); 12 | const [page, setPage] = useState(0); 13 | const [pageSize, setPageSize] = useState(5); 14 | const [totalRecords, setTotalRecords] = useState(0); 15 | const [loading, setLoading] = useState(true); 16 | 17 | function loadReports() { 18 | setLoading(true); 19 | fetch(apiUrl + "/reports/latest/?page_idx=" + page + "&page_size=" + pageSize) 20 | .then((res) => res.json()) 21 | .then((data) => { 22 | // eslint-disable-next-line @typescript-eslint/ban-ts-comment 23 | // @ts-expect-error 24 | const reports = data.reports.map((report) => new Report(report)); 25 | 26 | setReports(reports); 27 | setTotalRecords(data.total_records); 28 | setLoading(false); 29 | }) 30 | .catch((error) => { 31 | console.error("Error fetching reports:", error); 32 | }); 33 | } 34 | 35 | useEffect(() => { 36 | loadReports(); 37 | }, [page, pageSize]); 38 | 39 | return ( 40 | <div className="logs">
41 | <h2>Activity Logs</h2> 42 | <p>Total record {totalRecords}</p> 43 | <DataTable 44 | value={reports} 45 | paginator 46 | lazy 47 | rows={pageSize} 48 | first={page * pageSize} 49 | totalRecords={totalRecords} 50 | onPage={(e) => { 51 | // eslint-disable-next-line @typescript-eslint/ban-ts-comment 52 | // @ts-expect-error 53 | setPage(e.page); 54 | setPageSize(e.rows); 55 | }} 56 | loading={loading} 57 | rowsPerPageOptions={[5, 10, 20, 50, 100]} 58 | showGridlines={true} 59 | > 60 | <Column header="Timestamp" body={(rowData) => rowData.timestamp.toLocaleString()}/> 61 | <Column field="workerId" header="Worker"/> 62 | <Column header="Finished files" body={(rowData) => { 63 | return ( 64 | <div> 65 | {rowData.finishedFilePaths.map((file: string, index: number) => ( 66 | <div key={index}>• {file}</div> 67 | ))} 68 | </div> 69 | ); 70 | }}/> 71 | </DataTable> 72 | </div>
73 | ) 74 | } 75 | 76 | export default ReportsTableView; 77 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/WorkerLatest.ts: -------------------------------------------------------------------------------- 1 | class LatestReport { 2 | timestamp: Date; 3 | count: number; 4 | 5 | // eslint-disable-next-line @typescript-eslint/ban-ts-comment 6 | // @ts-expect-error 7 | constructor(data) { 8 | this.timestamp = new Date(data.timestamp); 9 | this.count = data.count; 10 | } 11 | } 12 | 13 | class WorkerLatest { 14 | workerId: string; 15 | latestTimestamp: Date; 16 | lastActive: string; 17 | latestReports: LatestReport[]; 18 | 19 | // eslint-disable-next-line @typescript-eslint/ban-ts-comment 20 | // @ts-expect-error 21 | constructor(data) { 22 | this.workerId = data.worker_id; 23 | this.latestTimestamp = new Date(data.latest_timestamp); 24 | this.lastActive = data.last_active; 25 | 26 | // eslint-disable-next-line @typescript-eslint/ban-ts-comment 27 | // @ts-expect-error 28 | this.latestReports = data.latest_reports.map((report) => new LatestReport(report)); 29 | } 30 | 31 | status(): string { 32 | const now = new Date(); 33 | const diff = now.getTime() - this.latestTimestamp.getTime(); 34 | if (diff > 60 * 60 * 1000) { 35 | return 'error'; 36 | } else if (diff > 30 * 60 * 1000) { 37 | return 'warning'; 38 | } else { 39 | return 'active'; 40 | } 41 | } 42 | } 43 | 44 | export {WorkerLatest, LatestReport}; -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/assets/react.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/index.css: -------------------------------------------------------------------------------- 1 | :root { 2 | font-family: Inter, system-ui, Avenir, Helvetica, Arial, sans-serif; 3 | line-height: 1.5; 4 | font-weight: 400; 5 | 6 | color-scheme: light dark; 7 | color: rgba(255, 255, 255, 0.87); 8 | background-color: #242424; 9 | 10 | font-synthesis: none; 11 | text-rendering: optimizeLegibility; 12 | -webkit-font-smoothing: antialiased; 13 | -moz-osx-font-smoothing: grayscale; 14 | } 15 | 16 | a { 17 | font-weight: 500; 18 | color: #646cff; 19 | text-decoration: inherit; 20 | } 21 | a:hover { 22 | color: #535bf2; 23 | } 24 | 25 | body { 26 | margin: 0; 27 | display: flex; 28 | place-items: center; 29 | min-width: 320px; 30 | min-height: 100vh; 31 | } 32 | 33 | h1 { 34 | font-size: 3.2em; 35 | line-height: 1.1; 36 | } 37 | 38 | button { 39 | border-radius: 8px; 40 | border: 1px solid transparent; 41 | padding: 0.6em 1.2em; 42 | font-size: 1em; 43 | font-weight: 500; 44 | font-family: inherit; 45 | background-color: #1a1a1a; 46 | cursor: pointer; 47 | transition: border-color 0.25s; 48 | } 49 | button:hover { 50 | border-color: #646cff; 51 | } 52 | button:focus, 53 | button:focus-visible { 54 | outline: 4px auto -webkit-focus-ring-color; 55 | } 56 | 57 | @media (prefers-color-scheme: light) { 58 | :root { 59 | color: #213547; 60 | background-color: #ffffff; 61 | } 62 | a:hover { 63 | color: #747bff; 64 | } 65 | button { 66 | background-color: #f9f9f9; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/main.tsx: -------------------------------------------------------------------------------- 1 | import { StrictMode } from 
'react' 2 | import { createRoot } from 'react-dom/client' 3 | import './index.css' 4 | import App from './App.tsx' 5 | 6 | createRoot(document.getElementById('root')!).render( 7 | <StrictMode> 8 | <App /> 9 | </StrictMode>, 10 | ) 11 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/src/vite-env.d.ts: -------------------------------------------------------------------------------- 1 | /// <reference types="vite/client" /> 2 | interface ImportMetaEnv { 3 | readonly VITE_BACKEND_API_URL: string; 4 | } 5 | 6 | interface ImportMeta { 7 | readonly env: ImportMetaEnv; 8 | } -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/tsconfig.app.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo", 4 | "target": "ES2020", 5 | "useDefineForClassFields": true, 6 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 7 | "module": "ESNext", 8 | "skipLibCheck": true, 9 | 10 | /* Bundler mode */ 11 | "moduleResolution": "bundler", 12 | "allowImportingTsExtensions": true, 13 | "isolatedModules": true, 14 | "moduleDetection": "force", 15 | "noEmit": true, 16 | "jsx": "react-jsx", 17 | 18 | /* Linting */ 19 | "strict": true, 20 | "noUnusedLocals": true, 21 | "noUnusedParameters": true, 22 | "noFallthroughCasesInSwitch": true, 23 | "noUncheckedSideEffectImports": true 24 | }, 25 | "include": ["src"] 26 | } 27 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": [], 3 | "references": [ 4 | { "path": "./tsconfig.app.json" }, 5 | { "path": "./tsconfig.node.json" } 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/tsconfig.node.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo", 4 | "target": "ES2022", 5 | "lib": ["ES2023"], 6 | "module": "ESNext", 7 | "skipLibCheck": true, 8 | 9 | /* Bundler mode */ 10 | "moduleResolution": "bundler", 11 | "allowImportingTsExtensions": true, 12 | "isolatedModules": true, 13 | "moduleDetection": "force", 14 | "noEmit": true, 15 | 16 | /* Linting */ 17 | "strict": true, 18 | "noUnusedLocals": true, 19 | "noUnusedParameters": true, 20 | "noFallthroughCasesInSwitch": true, 21 | "noUncheckedSideEffectImports": true 22 | }, 23 | "include": ["vite.config.ts"] 24 | } 25 | -------------------------------------------------------------------------------- /src/mmore/dashboard/frontend/vite.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'vite' 2 | import react from '@vitejs/plugin-react' 3 | 4 | // https://vite.dev/config/ 5 | export default defineConfig({ 6 | plugins: [react()], 7 | }) 8 | -------------------------------------------------------------------------------- /src/mmore/dashboard/node_modules/.package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dashboard", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": { 6 | "node_modules/safecolor": { 7 | "version": "1.0.1", 8 | "resolved": "https://registry.npmjs.org/safecolor/-/safecolor-1.0.1.tgz", 9 | "integrity":
"sha512-Gv+4W+zKdViltaXv36U8/4BIOBxNxN8/mfPVyiO7qobCcSFsBK8+UtrclJizecKgK02458feT/9PXZXLxo0YxA==", 10 | "license": "MIT" 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/mmore/dashboard/node_modules/safecolor/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jess Uni 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/mmore/dashboard/node_modules/safecolor/README.md: -------------------------------------------------------------------------------- 1 | # SafeColor 2 | SafeColor generates accessible colors that complies with WCAG success criteria 1.4.3 (or any contrast ratio of your choice). 3 | It can be used to: 4 | 5 | 1. generate a random color that is contrast safe with a given color 6 | 2. generate a consistent, contrast safe color for a string 7 | 8 | No need to worry about your base color is light/dark or for foreground/background. If the given color is too light to meet your desired contrast ratio, SafeColor will look for a darker color and vice versa. 9 | 10 | 11 | ## Install 12 | 13 | `npm install safecolor` 14 | 15 | ## Usage 16 | 17 | `import SafeColor from 'safecolor'` 18 | 19 | ### Basic 20 | 21 | This will assume that the generated color should be contrast safe (>= AA standard: 4.5) with black(rgb(0, 0, 0)) 22 | 23 | ```javascript 24 | safeColor = new SafeColor() 25 | 26 | safeColor.random() 27 | // >> rgb(104, 145, 26) 28 | // contrast ratio = 5.65 29 | 30 | safeColor.random('hello world') 31 | // >> rgb(196,226,239) 32 | // contrast ratio = 15.47 33 | ``` 34 | ### With options 35 | 36 | ```javascript 37 | safeColor = new SafeColor({ 38 | color: [255, 255, 255], // 8bit RGB value in array [r, g, b] 39 | contrast: 4.5, // the contrast ratio between the option color and the generated color will >= this 40 | }) 41 | 42 | safeColor.random() 43 | // >> rgb(32,80,46) 44 | // contrast ratio = 9.34 45 | 46 | safeColor.random('hello world') 47 | // >> rgb(20,57,74) 48 | // contrast ratio = 12.25 49 | ``` 50 | 51 | ## Options 52 | 53 | **color** 54 | 55 | - type: `Array` 56 | - default: `[0, 0, 0]` 57 | 58 | **contrast** 59 | 60 | - type: `Number` 61 | - default: `4.5` 62 | 63 | ## Notice 64 | ES6 features: destructing assignment and map are used in this script. You may need polyfill for the script to work properly. 
65 | 66 | Note: to keep this as simple as possible, the output is a RGB value in string. If any built-in conversions (to HEX, to HSL) will make SafeColor much more convenient for you, please contact me to add the feature or feel free to pull request. Cheers! 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /src/mmore/dashboard/node_modules/safecolor/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "safecolor", 3 | "version": "1.0.1", 4 | "description": "SafeColor generates accessible colors that compiles with WCAG success criteria 1.4.3 (or any contrast ratio of your choice). It can generate either a random color that is contrast safe with a given color, or a consistent color for a given string.", 5 | "main": "safecolor.js", 6 | "scripts": { 7 | "test": "test" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/jessuni/SafeColor.git" 12 | }, 13 | "keywords": [ 14 | "wacg", 15 | "contrast-ratio", 16 | "accessibility", 17 | "color", 18 | "string", 19 | "hex", 20 | "rgb", 21 | "hsl", 22 | "hci", 23 | "luminance", 24 | "luma" 25 | ], 26 | "author": "jessunix@gmail.com", 27 | "license": "MIT", 28 | "bugs": { 29 | "url": "https://github.com/jessuni/SafeColor/issues" 30 | }, 31 | "homepage": "https://github.com/jessuni/SafeColor#readme" 32 | } 33 | -------------------------------------------------------------------------------- /src/mmore/dashboard/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dashboard", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": { 6 | "": { 7 | "dependencies": { 8 | "safecolor": "^1.0.1" 9 | } 10 | }, 11 | "node_modules/safecolor": { 12 | "version": "1.0.1", 13 | "resolved": "https://registry.npmjs.org/safecolor/-/safecolor-1.0.1.tgz", 14 | "integrity": "sha512-Gv+4W+zKdViltaXv36U8/4BIOBxNxN8/mfPVyiO7qobCcSFsBK8+UtrclJizecKgK02458feT/9PXZXLxo0YxA==", 15 | "license": "MIT" 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/mmore/dashboard/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "safecolor": "^1.0.1" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /src/mmore/index/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/src/mmore/index/__init__.py -------------------------------------------------------------------------------- /src/mmore/process/__init__.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from .config import get as get_config 4 | from .config import set as set_config 5 | 6 | 7 | @click.group() 8 | def main(): 9 | """mmore""" 10 | pass 11 | 12 | 13 | # main.add_command(test) 14 | @main.group() 15 | def config(): 16 | """mmore config""" 17 | pass 18 | 19 | 20 | @config.command() 21 | @click.argument("key_path", required=False) 22 | def get(key_path): 23 | value = get_config(key_path) 24 | if value is not None: 25 | if key_path: 26 | click.echo(f"{key_path}: {value}") 27 | else: 28 | click.echo(f"{value}") 29 | 30 | 31 | @config.command() 32 | @click.argument("key_path") 33 | @click.argument("value") 34 | def set(key_path, value): 35 | """Set a 
value in the config based on the provided key path.""" 36 | config_path = set_config(key_path, value) 37 | if config_path: 38 | click.echo(f"Saved config {key_path} with value {value} to: {config_path}") 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /src/mmore/process/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import click 5 | import yaml 6 | 7 | default_config = { 8 | "processor": { 9 | "MediaProcessor": [ 10 | {"normal_model": "openai/whisper-large-v3-turbo"}, 11 | {"fast_model": "openai/whisper-tiny"}, 12 | ], 13 | "PDFProcessor": [ 14 | {"PDFTEXT_CPU_WORKERS": 0}, # We use cpu_count() 15 | {"DETECTOR_BATCH_SIZE": 120}, 16 | {"DETECTOR_POSTPROCESSING_CPU_WORKERS": 0}, # We use cpu_count() 17 | {"RECOGNITION_BATCH_SIZE": 64}, 18 | {"OCR_PARALLEL_WORKERS": 0}, # We use cpu_count() 19 | {"TEXIFY_BATCH_SIZE": 120}, 20 | {"LAYOUT_BATCH_SIZE": 120}, 21 | {"ORDER_BATCH_SIZE": 90}, 22 | {"TABLE_REC_BATCH_SIZE": 120}, 23 | ], 24 | }, 25 | "dispatcher": { 26 | "node_batch_sizes": [ 27 | {"URLProcessor": 40}, 28 | {"DOCXProcessor": 100}, 29 | {"PDFProcessor": 3000}, 30 | {"MediaProcessor": 40}, 31 | {"SpreadsheetProcessor": 100}, 32 | {"TXTProcessor": 100}, 33 | {"PPTXProcessor": 100}, 34 | {"MarkdownProcessor": 100}, 35 | {"EMLProcessor": 100}, 36 | {"HTMLProcessor": 100}, 37 | ] 38 | }, 39 | } 40 | 41 | 42 | def get_config_path(): 43 | """Get the path to the YAML config file.""" 44 | config_path = os.getenv("MMORE_CONFIG") 45 | if config_path: 46 | return Path(config_path) 47 | 48 | app_dir = Path(click.get_app_dir("mmore")) 49 | try: 50 | app_dir.mkdir(parents=True, exist_ok=True) 51 | except PermissionError as e: 52 | click.echo(f"Error creating config directory: {e}", err=True) 53 | raise e 54 | return app_dir / "config.yaml" 55 | 56 | 57 | def load_config(): 58 | """Load the data store from the YAML file.""" 59 | file_path = get_config_path() 60 | if file_path.exists(): 61 | try: 62 | with file_path.open("r") as file: 63 | return yaml.safe_load(file) or default_config 64 | except yaml.YAMLError as e: 65 | click.echo(f"Error loading config file: {e}", err=True) 66 | return default_config 67 | return default_config 68 | 69 | 70 | def save_config(data): 71 | """Save the data store to the YAML file.""" 72 | file_path = get_config_path() 73 | try: 74 | with file_path.open("w") as file: 75 | yaml.safe_dump(data, file) 76 | except Exception as e: 77 | click.echo(f"Error saving config file: {e}", err=True) 78 | 79 | 80 | def set_nested_value(d, key_path, value): 81 | keys = key_path.split(".") 82 | current = d 83 | for key in keys[:-1]: 84 | if key not in current: 85 | click.echo(f"Warning: Key '{key}' does not exist in the config.", err=True) 86 | return False 87 | current = current[key] 88 | if keys[-1] not in current: 89 | click.echo(f"Warning: Key '{keys[-1]}' does not exist in the config.", err=True) 90 | return False 91 | current[keys[-1]] = value 92 | return True 93 | 94 | 95 | def get_nested_value(d, key_path): 96 | if not key_path: 97 | return d 98 | keys = key_path.split(".") 99 | current = d 100 | for key in keys: 101 | if key not in current: 102 | click.echo(f"Error: Key '{key}' does not exist in the config.", err=True) 103 | return None 104 | current = current[key] 105 | return current 106 | 107 | 108 | def get(key_path=None): 109 | """Get a value from the config based on the provided key 
path.""" 110 | config = load_config() 111 | value = get_nested_value(config, key_path) 112 | return value 113 | 114 | 115 | def set(key_path, value): 116 | """Set a value in the config based on the provided key path.""" 117 | config = load_config() 118 | if set_nested_value(config, key_path, value): 119 | save_config(config) 120 | 121 | return get_config_path() 122 | -------------------------------------------------------------------------------- /src/mmore/process/execution_state.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional, cast 3 | 4 | from dask.distributed import Variable 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ExecutionState: 10 | """ 11 | The global state of the execution, this class is static and stores the global state whenever the execution should stop or not. 12 | Every processor can check this state to see if it should stop execution. 13 | Supports both local and distributed execution 14 | In local execution, the state is stored in a static variable 15 | In distributed execution, the state is stored in a dask 'Variable', 16 | you need to connect to a dask cluster to use this feature (use client = Client(...) before calling .initialize()) 17 | and this variable will be shared across the custer 18 | """ 19 | 20 | # static variables 21 | _use_dask: Optional[bool] = None 22 | _dask_var: Optional[Variable] = None 23 | _local_state: bool = False 24 | 25 | @staticmethod 26 | def initialize(distributed_mode=False, client=None): 27 | """ 28 | Initializes the global state manager 29 | :param distributed_mode: Whether the execution is in distributed mode 30 | :param client: connection client to the dask cluster 31 | """ 32 | if ExecutionState._use_dask is not None: 33 | raise Exception("Execution state already initialized") 34 | assert distributed_mode is not None, ( 35 | "Distributed mode must be set to True or False" 36 | ) 37 | ExecutionState._use_dask = distributed_mode 38 | 39 | if distributed_mode: 40 | assert client is not None, ( 41 | "You must be in the context of a dask client to use distributed mode" 42 | ) 43 | ExecutionState._dask_var = Variable("should_stop_execution", client=client) 44 | ExecutionState._dask_var.set(False) 45 | logger.info("Execution state initialized (distributed mode)") 46 | else: 47 | ExecutionState._local_state = False 48 | logger.info("Execution state initialized (local mode)") 49 | 50 | @staticmethod 51 | def shutdown(): 52 | ExecutionState._use_dask = None 53 | ExecutionState._dask_var = None 54 | ExecutionState._local_state = False 55 | 56 | @staticmethod 57 | def get_should_stop_execution() -> bool: 58 | """Returns the global execution state (True if it should stop)""" 59 | if ExecutionState._use_dask is None: 60 | raise Exception("Execution state not initialized") 61 | if ExecutionState._use_dask: 62 | try: 63 | return cast(bool, cast(Variable, ExecutionState._dask_var).get()) 64 | except Exception as e: 65 | logger.error(f"Error getting dask variable: {e}") 66 | return True 67 | else: 68 | return ExecutionState._local_state 69 | 70 | @staticmethod 71 | def set_should_stop_execution(value: bool): 72 | """Sets the global execution stop state""" 73 | logger.info(f"Setting execution state to {value}") 74 | if ExecutionState._use_dask is None: 75 | raise Exception("Execution state not initialized") 76 | if ExecutionState._use_dask: 77 | cast(Variable, ExecutionState._dask_var).set(value) 78 | else: 79 | ExecutionState._local_state = value 
80 | logger.info(f"Execution state set to {value}") 81 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import cast 2 | 3 | from ...utils import load_config 4 | from .base import BasePostProcessor, BasePostProcessorConfig 5 | from .filter import FILTER_TYPES, load_filter 6 | from .filter.base import BaseFilterConfig 7 | from .tagger import TAGGER_TYPES, load_tagger 8 | from .tagger.base import BaseTaggerConfig 9 | 10 | __all__ = ["BasePostProcessor", "BasePostProcessorConfig", "load_postprocessor"] 11 | 12 | 13 | def load_postprocessor(config: BasePostProcessorConfig) -> BasePostProcessor: 14 | if config.type in FILTER_TYPES: 15 | return load_filter(cast(BaseFilterConfig, config)) 16 | 17 | elif config.type in TAGGER_TYPES: 18 | return load_tagger(cast(BaseTaggerConfig, config)) 19 | 20 | elif config.type == "chunker": 21 | from .chunker import MultimodalChunker, MultimodalChunkerConfig 22 | 23 | config_chunk = load_config(config.args, MultimodalChunkerConfig) 24 | return MultimodalChunker.from_config(config_chunk) 25 | 26 | elif config.type == "ner": 27 | from .ner import NERecognizer, NERExtractorConfig 28 | 29 | config_ner = load_config(config.args, NERExtractorConfig) 30 | return NERecognizer.from_config(config_ner) 31 | 32 | elif config.type == "translator": 33 | from .translator import TranslatorConfig, TranslatorPostProcessor 34 | 35 | config_translator = load_config(config.args, TranslatorConfig) 36 | return TranslatorPostProcessor.from_config(config_translator) 37 | 38 | elif config.type == "metafuse": 39 | from .metafuse import MetaDataInfusor, MetaDataInfusorConfig 40 | 41 | config_metafuse = load_config(config.args, MetaDataInfusorConfig) 42 | return MetaDataInfusor.from_config(config_metafuse) 43 | 44 | else: 45 | raise ValueError(f"Unrecognized postprocessor type: {config.type}") 46 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass, field 3 | from typing import Dict, List, Optional 4 | 5 | from tqdm import tqdm 6 | 7 | from ...type import MultimodalSample 8 | 9 | 10 | @dataclass 11 | class BasePostProcessorConfig: 12 | type: str 13 | name: Optional[str] = None 14 | args: Dict = field(default_factory=dict) 15 | 16 | def __post_init__(self): 17 | if self.name is None: 18 | self.name = self.type 19 | 20 | 21 | class BasePostProcessor(ABC): 22 | name: str 23 | 24 | def __init__(self, name: str): 25 | self.name = name 26 | 27 | def __repr__(self): 28 | return f"{self.__class__.__name__}({self.name})" 29 | 30 | def __call__( 31 | self, sample: MultimodalSample, **kwargs 32 | ) -> MultimodalSample | List[MultimodalSample]: 33 | return self.process(sample, **kwargs) 34 | 35 | @abstractmethod 36 | def process(self, sample: MultimodalSample, **kwargs) -> List[MultimodalSample]: 37 | """Abstract method for processing a sample. 38 | 39 | Args: 40 | sample (MultimodalSample): The sample to process. 41 | 42 | Returns: 43 | List[MultimodalSample]: The processed sample(s). 44 | """ 45 | pass 46 | 47 | def batch_process( 48 | self, samples: List[MultimodalSample], **kwargs 49 | ) -> List[MultimodalSample]: 50 | """ 51 | Process a batch of samples. 
52 | Args: 53 | samples: a list of samples to process 54 | kwargs: additional arguments to pass to the process method 55 | 56 | Returns: a list of processed samples 57 | """ 58 | res = [] 59 | for s in tqdm(samples, desc=f"{self.name}"): 60 | res += self.process(s, **kwargs) 61 | 62 | return res 63 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/chunker/__init__.py: -------------------------------------------------------------------------------- 1 | from .multimodal import MultimodalChunker, MultimodalChunkerConfig 2 | 3 | __all__ = ["MultimodalChunker", "MultimodalChunkerConfig"] 4 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/chunker/multimodal.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from dataclasses import dataclass, field 4 | from typing import Any, Dict, List, Optional 5 | 6 | from chonkie import BaseChunker, Chunk 7 | 8 | from ....type import MultimodalSample 9 | from .. import BasePostProcessor 10 | from .utils import load_chonkie 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | @dataclass 16 | class MultimodalChunkerConfig: 17 | chunking_strategy: str = "sentence" 18 | text_chunker_config: Dict[str, Any] = field(default_factory=dict) 19 | 20 | 21 | class MultimodalChunker(BasePostProcessor): 22 | text_chunker: BaseChunker 23 | 24 | def __init__(self, text_chunker: BaseChunker): 25 | super().__init__("🦛 Chunker") 26 | self.text_chunker = text_chunker 27 | 28 | @classmethod 29 | def from_config(cls, config: MultimodalChunkerConfig): 30 | text_chunker = load_chonkie( 31 | config.chunking_strategy, config.text_chunker_config 32 | ) 33 | return cls(text_chunker=text_chunker) 34 | 35 | def process(self, sample: MultimodalSample, **kwargs) -> List[MultimodalSample]: 36 | return self.chunk(sample) 37 | 38 | @staticmethod 39 | def _chunk_modalities(sample: MultimodalSample, text_chunks: List[Chunk]): 40 | # Find all <attachment> markers 41 | attachment_indices = [ 42 | m.start() for m in re.finditer(r"<attachment>", sample.text) 43 | ] 44 | # Create an empty list to hold modalities for each chunk 45 | chunked_modalities = [[] for _ in range(len(text_chunks))] 46 | 47 | m = 0 # To track which modality to assign 48 | for idx in attachment_indices: 49 | if m >= len(sample.modalities): 50 | break 51 | chunk_index = _text_index_to_chunk_index(idx, text_chunks) 52 | assert chunk_index is not None 53 | chunked_modalities[chunk_index].append(sample.modalities[m]) 54 | m += 1 55 | 56 | return chunked_modalities 57 | 58 | def chunk(self, sample: MultimodalSample) -> List[MultimodalSample]: 59 | """Split sample into chunks according to the implementation strategy. 60 | 61 | Args: 62 | sample: Input sample to be chunked 63 | 64 | Returns: 65 | List of Chunk objects containing the chunked text and metadata 66 | """ 67 | if not sample.text or not sample.text.strip(): 68 | logger.warning(f"Empty text in sample {sample.id}.
Skipping chunking.") 69 | return [] 70 | try: 71 | # Chunk using the text chunker 72 | text_chunks = self.text_chunker.chunk(sample.text) 73 | except Exception as e: 74 | logger.error( 75 | f"Chunking error on sample with length {len(sample.text)}: {e}" 76 | ) 77 | return [] 78 | # Chunk modalities according to the text chunks 79 | modalities_chunks = MultimodalChunker._chunk_modalities(sample, text_chunks) 80 | 81 | chunks = [] 82 | for i, (chunk, mods) in enumerate(zip(text_chunks, modalities_chunks)): 83 | s = MultimodalSample( 84 | text=chunk.text, 85 | modalities=mods, 86 | metadata=sample.metadata, 87 | id=f"{sample.id}+{i}", 88 | ) 89 | chunks.append(s) 90 | 91 | return chunks 92 | 93 | 94 | def _text_index_to_chunk_index(index: int, chunks: List[Chunk]) -> Optional[int]: 95 | for i, chunk in enumerate(chunks): 96 | if chunk.start_index <= index < chunk.end_index: 97 | return i 98 | return None 99 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/chunker/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from chonkie import ( 4 | BaseChunker, 5 | SemanticChunker, 6 | SentenceChunker, 7 | TokenChunker, 8 | WordChunker, 9 | ) 10 | 11 | 12 | def load_chonkie(chunking_strategy: str, chunking_args: Dict[str, Any]) -> BaseChunker: 13 | if chunking_strategy == "sentence": 14 | return SentenceChunker(**chunking_args) 15 | elif chunking_strategy == "semantic": 16 | return SemanticChunker(**chunking_args) 17 | elif chunking_strategy == "word": 18 | return WordChunker(**chunking_args) 19 | elif chunking_strategy == "token": 20 | return TokenChunker(**chunking_args) 21 | else: 22 | raise ValueError(f"Unsupported chunker: {chunking_strategy}") 23 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/filter/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseFilter, BaseFilterConfig 2 | from .datatrove_wrapper import DATATROVE_FILTERS, DatatroveFilter 3 | 4 | __all__ = ["BaseFilter", "DatatroveFilter"] 5 | 6 | DATATROVE_MAP = {c: DatatroveFilter for c in DATATROVE_FILTERS} 7 | 8 | FILTERS_LOADERS_MAP = {**DATATROVE_MAP} 9 | FILTER_TYPES = list(FILTERS_LOADERS_MAP.keys()) 10 | 11 | 12 | def load_filter(config: BaseFilterConfig) -> BaseFilter: 13 | if config.type in FILTERS_LOADERS_MAP: 14 | return FILTERS_LOADERS_MAP[config.type].from_config(config) 15 | else: 16 | raise ValueError(f"Unrecognized filter type: {config.type}") 17 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/filter/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from dataclasses import dataclass, field 3 | from typing import Any, List, Optional, Tuple 4 | 5 | from tqdm import tqdm 6 | 7 | from ....process.post_processor import BasePostProcessor 8 | from ....type import MultimodalSample 9 | 10 | 11 | @dataclass 12 | class BaseFilterConfig: 13 | type: str 14 | name: Optional[str] = None 15 | args: Any = field(default_factory=dict) 16 | 17 | def __post_init__(self): 18 | if self.name is None: 19 | self.name = self.type 20 | 21 | 22 | class BaseFilter(BasePostProcessor): 23 | name: str 24 | 25 | def __init__(self, name: str): 26 | self.name = name 27 | 28 | def __repr__(self): 29 | return
f"{self.__class__.__name__}({self.name})" 30 | 31 | @abstractmethod 32 | def filter(self, sample: MultimodalSample) -> bool | Tuple[bool, str]: 33 | """Abstract method for processing a sample. 34 | 35 | Args: 36 | sample (MultimodalSample): The sample to process. 37 | 38 | Returns: 39 | bool: Whether the doc should be kept. 40 | str: If the document must be ignored, the reason. 41 | """ 42 | pass 43 | 44 | def process(self, sample: MultimodalSample, **kwargs) -> List[MultimodalSample]: 45 | res = self.filter(sample) 46 | if res: 47 | return [sample] 48 | else: 49 | return [] 50 | 51 | def batch_filter( 52 | self, batch: List[MultimodalSample] 53 | ) -> List[bool | Tuple[bool, str]]: 54 | """ 55 | Overwrite this method to implement batched filtering. Batches have size `self.batch_size`, except possibly the last one. 56 | Args: 57 | batch: a list of Document to process 58 | 59 | Returns: a list, the same size as `batch`, containing the filter result for each document 60 | 61 | """ 62 | return list(map(self.filter, tqdm(batch, desc=f"{self.name}"))) 63 | 64 | def batch_process(self, samples, **kwargs) -> List[MultimodalSample]: 65 | """ 66 | Process a batch of samples. 67 | Args: 68 | samples: a list of samples to process 69 | kwargs: additional arguments to pass to the process method 70 | 71 | Returns: a list of processed samples 72 | """ 73 | res = self.batch_filter(samples) 74 | return [s for s, r in zip(samples, res) if r] 75 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/filter/datatrove_wrapper.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Tuple, Union, cast 2 | 3 | import nltk 4 | from datatrove.data import Document, Media 5 | from datatrove.pipeline.filters import ( 6 | C4QualityFilter, 7 | FastTextClassifierFilter, 8 | FineWebQualityFilter, 9 | GopherQualityFilter, 10 | GopherRepetitionFilter, 11 | LambdaFilter, 12 | LanguageFilter, 13 | RegexFilter, 14 | SamplerFilter, 15 | UnigramLogProbFilter, 16 | URLFilter, 17 | ) 18 | from datatrove.pipeline.filters.base_filter import BaseFilter as DatatroveBaseFilter 19 | from datatrove.pipeline.writers.jsonl import JsonlWriter 20 | from tqdm import tqdm 21 | 22 | from ....type import MultimodalSample 23 | from .base import BaseFilter, BaseFilterConfig 24 | 25 | nltk.download("punkt_tab", quiet=True) 26 | 27 | FILTERS_MAP = { 28 | "filter_language": LanguageFilter, 29 | "filter_gopher-repetition": GopherRepetitionFilter, 30 | "filter_gopher-quality": GopherQualityFilter, 31 | "filter_fineweb": FineWebQualityFilter, 32 | "filter_c4": C4QualityFilter, 33 | "sampler": SamplerFilter, 34 | "filter_regex": RegexFilter, 35 | "filter_fasttext": FastTextClassifierFilter, 36 | "filter_lambda": LambdaFilter, 37 | "filter_unigram-logprob": UnigramLogProbFilter, 38 | "filter_url": URLFilter, 39 | } 40 | DATATROVE_FILTERS = list(FILTERS_MAP.keys()) 41 | 42 | 43 | def load_datatrove_filter( 44 | filter_name: str, filter_args: Dict[str, Any] 45 | ) -> DatatroveBaseFilter: 46 | if filter_name not in FILTERS_MAP: 47 | raise ValueError(f"Unsupported filter: {filter_name}") 48 | if "exclusion_writer" in filter_args and isinstance( 49 | filter_args["exclusion_writer"], str 50 | ): 51 | filter_args["exclusion_writer"] = JsonlWriter(filter_args["exclusion_writer"]) 52 | return FILTERS_MAP[filter_name](**filter_args) 53 | 54 | 55 | class DatatroveFilter(BaseFilter): 56 | datatrove_filter: DatatroveBaseFilter 57 | 58 | 
def __init__(self, name: str, datatrove_filter: DatatroveBaseFilter): 59 | super().__init__(name) 60 | self.datatrove_filter = datatrove_filter 61 | 62 | @classmethod 63 | def from_config(cls, config: BaseFilterConfig) -> "DatatroveFilter": 64 | datatrove_filter = load_datatrove_filter(config.type, config.args) 65 | return cls(name=datatrove_filter.name, datatrove_filter=datatrove_filter) 66 | 67 | @staticmethod 68 | def sample_to_doc(sample: MultimodalSample) -> Document: 69 | def type_as_int(x): 70 | return {"image": 0, "video": 1, "audio": 2}[x] 71 | 72 | return Document( 73 | text=sample.text, 74 | id=sample.id, 75 | media=[ 76 | Media(type=type_as_int(modality.type), url=modality.value) 77 | for modality in sample.modalities 78 | ], 79 | metadata=cast(Dict[str, Union[str, int, float, bool]], sample.metadata), 80 | ) 81 | 82 | def filter(self, sample: MultimodalSample) -> bool | Tuple[bool, str]: 83 | """Abstract method for processing a sample. 84 | 85 | Args: 86 | sample (MultimodalSample): The sample to process. 87 | 88 | Returns: 89 | bool: Whether the doc should be kept. 90 | str: If the document must be ignored, the reason. 91 | """ 92 | # Filter the document 93 | res = self.datatrove_filter.filter(DatatroveFilter.sample_to_doc(sample)) 94 | if isinstance(res, bool): 95 | return res 96 | else: 97 | return res[0] 98 | # return self.datatrove_filter.filter(DatatroveFilter.sample_to_doc(sample)) 99 | 100 | def batch_filter(self, batch): 101 | """Abstract method for processing a batch of samples. 102 | 103 | Args: 104 | batch (List[MultimodalSample]): The batch to process. 105 | 106 | Returns: 107 | List[bool]: Whether each document should be kept. 108 | """ 109 | batch = tqdm( 110 | [DatatroveFilter.sample_to_doc(sample) for sample in batch], 111 | desc=f"{self.name}", 112 | ) 113 | return self.datatrove_filter.filter_batch(cast(List[Document], batch)) 114 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/metafuse/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import MetaDataInfusor, MetaDataInfusorConfig, MetaDataPosition 2 | 3 | __all__ = ["MetaDataInfusor", "MetaDataInfusorConfig", "MetaDataPosition"] 4 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/metafuse/base.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | from typing import List 5 | 6 | from mmore.process.post_processor.base import BasePostProcessor 7 | from mmore.type import MultimodalSample 8 | 9 | 10 | class MetaDataPosition(Enum): 11 | BEGINNING = "beginning" 12 | END = "end" 13 | 14 | 15 | @dataclass 16 | class MetaDataInfusorConfig: 17 | metadata_keys: List[str] 18 | content_template: str 19 | position: str 20 | 21 | 22 | class MetaDataInfusor(BasePostProcessor): 23 | def __init__( 24 | self, 25 | metadata_keys: List[str], 26 | content_template: str, 27 | position: MetaDataPosition, 28 | ): 29 | super().__init__(name="☕ Metadata Infusor") 30 | self.metadata_keys = metadata_keys 31 | self.content_template = content_template 32 | self.position = position 33 | 34 | @classmethod 35 | def from_config(cls, config: MetaDataInfusorConfig): 36 | metadata_infusor = MetaDataInfusor( 37 | metadata_keys=config.metadata_keys, 38 | content_template=config.content_template, 39 | 
position=MetaDataPosition(config.position), 40 | ) 41 | return metadata_infusor 42 | 43 | def process(self, sample: MultimodalSample, **kwargs) -> List[MultimodalSample]: 44 | format_mapping = defaultdict() 45 | for key in self.metadata_keys: 46 | value = sample.metadata.get(key, "") 47 | format_mapping[key] = value 48 | 49 | metadata_content = self.content_template.format_map(format_mapping) 50 | 51 | match self.position: 52 | case MetaDataPosition.BEGINNING: 53 | new_content = metadata_content + "\n" + sample.text 54 | case MetaDataPosition.END: 55 | new_content = sample.text + "\n" + metadata_content 56 | case _: 57 | new_content = sample.text 58 | 59 | return [ 60 | MultimodalSample(new_content, sample.modalities, sample.metadata, sample.id) 61 | ] 62 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/ner/__init__.py: -------------------------------------------------------------------------------- 1 | from .ner import NERecognizer, NERExtractorConfig 2 | 3 | __all__ = ["NERecognizer", "NERExtractorConfig"] 4 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/ner/ner.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from ....process.post_processor import BasePostProcessor 4 | from ....type import MultimodalSample 5 | from .extractor import NERExtractor, NERExtractorConfig 6 | 7 | 8 | class NERecognizer(BasePostProcessor): 9 | def __init__(self, extractor: NERExtractor): 10 | super().__init__("🔎 NER") 11 | self._extractor = extractor 12 | 13 | @classmethod 14 | def from_config(cls, config: NERExtractorConfig): 15 | extractor = NERExtractor.from_config(config) 16 | return cls(extractor) 17 | 18 | def process(self, sample: MultimodalSample, **kwargs) -> List[MultimodalSample]: 19 | # Call the extractor to get the relation graph 20 | relation_graph = self._extractor.invoke(sample) 21 | 22 | # Convert the relation graph to a list of relations 23 | entities = [ 24 | {"entity": e, **entity_desc} 25 | for e, entity_desc in relation_graph.nodes(data=True) 26 | ] 27 | 28 | # Add the relations to the sample metadata 29 | sample.metadata["ner"] = entities 30 | 31 | return [sample] 32 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/pipeline.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from dataclasses import dataclass 5 | from typing import List, Optional 6 | 7 | from ...type import MultimodalSample 8 | from . 
import BasePostProcessor, BasePostProcessorConfig, load_postprocessor 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | @dataclass 14 | class OutputConfig: 15 | output_path: str 16 | save_each_step: bool = False 17 | 18 | def __post_init__(self): 19 | if not os.path.exists(self.output_path): 20 | os.makedirs(self.output_path) 21 | 22 | 23 | @dataclass 24 | class PPPipelineConfig: 25 | pp_modules: List[BasePostProcessorConfig] 26 | output: OutputConfig 27 | 28 | 29 | class PPPipeline: 30 | def __init__( 31 | self, 32 | *processors: BasePostProcessor, 33 | output_config: Optional[OutputConfig] = None, 34 | ): 35 | if output_config is None: 36 | output_config = OutputConfig(output_path="./results") 37 | 38 | self.post_processors = processors 39 | self.output_config = output_config 40 | 41 | # Log the pipeline 42 | self._log_plan() 43 | 44 | def __add__(self, other): 45 | return PPPipeline( 46 | *self.post_processors, 47 | *other.post_processors, 48 | output_config=self.output_config, 49 | ) 50 | 51 | def _log_plan(self): 52 | logger.info("-" * 50) 53 | logger.info("PP Pipeline:") 54 | for i, processor in enumerate(self.post_processors): 55 | logger.info(f" > {i + 1}. {processor.name}") 56 | logger.info("-" * 50) 57 | 58 | @classmethod 59 | def from_config(cls, config: PPPipelineConfig): 60 | post_processors = [ 61 | load_postprocessor(pp_config) for pp_config in config.pp_modules 62 | ] 63 | return cls(*post_processors, output_config=config.output) 64 | 65 | def __call__(self, samples: List[MultimodalSample]) -> List[MultimodalSample]: 66 | return self.run(samples) 67 | 68 | def run(self, samples: List[MultimodalSample]) -> List[MultimodalSample]: 69 | """ 70 | Run the post-processing pipeline on a list of multimodal samples. 71 | 72 | Args: 73 | samples (List[MultimodalSample]): List of multimodal samples. 74 | 75 | Returns: 76 | List[MultimodalSample]: Post-processed multimodal samples. 77 | """ 78 | for i, processor in enumerate(self.post_processors): 79 | samples = processor.batch_process(samples) 80 | if self.output_config.save_each_step: 81 | self.save_results(samples, f"{i + 1}___{processor.name}.jsonl") 82 | self.save_results(samples, "final_pp.jsonl") 83 | return samples 84 | 85 | def save_results(self, samples: List[MultimodalSample], filename: str) -> None: 86 | """ 87 | Save multimodal samples to a JSONL file. 88 | 89 | Args: 90 | samples (List[MultimodalSample]): List of multimodal samples. 91 | filename (str): Name of the JSONL file, created in the configured output path.
92 | """ 93 | output_path = os.path.join(self.output_config.output_path, filename) 94 | with open(output_path, "w") as f: 95 | for result in samples: 96 | f.write(json.dumps(result.to_dict()) + "\n") 97 | logger.info(f"Results saved to {output_path}!") 98 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/tagger/__init__.py: -------------------------------------------------------------------------------- 1 | from mmore.process.post_processor.tagger.file_namer import FileNamer 2 | 3 | from .base import BaseTaggerConfig 4 | from .lang_detector import LangDetector 5 | from .modalities import ModalitiesCounter 6 | from .words import WordsCounter 7 | 8 | __all__ = ["ModalitiesCounter", "WordsCounter", "LangDetector"] 9 | 10 | TAGGERS_LOADERS_MAP = { 11 | "modalities_counter": ModalitiesCounter, 12 | "words_counter": WordsCounter, 13 | "lang_detector": LangDetector, 14 | "file_namer": FileNamer, 15 | } 16 | TAGGER_TYPES = list(TAGGERS_LOADERS_MAP.keys()) 17 | 18 | 19 | def load_tagger(config: BaseTaggerConfig): 20 | if config.type in TAGGER_TYPES: 21 | return TAGGERS_LOADERS_MAP[config.type].from_config(config) 22 | else: 23 | raise ValueError(f"Unrecognized tagger type: {config.type}") 24 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/tagger/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from dataclasses import dataclass, field 3 | from typing import Any, List, Optional 4 | 5 | from tqdm import tqdm 6 | 7 | from ....type import MultimodalSample 8 | from .. import BasePostProcessor 9 | 10 | 11 | @dataclass 12 | class BaseTaggerConfig: 13 | type: str 14 | name: Optional[str] = None 15 | metadata_key: Optional[str] = None 16 | args: Any = field(default_factory=lambda: {}) 17 | 18 | def __post_init__(self): 19 | if self.name is None: 20 | self.name = self.type 21 | if self.metadata_key is None: 22 | self.metadata_key = self.type 23 | 24 | 25 | class BaseTagger(BasePostProcessor): 26 | name: str 27 | metadata_key: str 28 | 29 | def __init__(self, name: str, metadata_key: str): 30 | self.name = name 31 | self.metadata_key = metadata_key 32 | 33 | def __repr__(self): 34 | return f"{self.__class__.__name__}({self.name})" 35 | 36 | @abstractmethod 37 | def tag(self, sample: MultimodalSample) -> Any: 38 | """Abstract method for processing a sample. 39 | 40 | Args: 41 | sample (MultimodalSample): The sample to process. 42 | 43 | Returns: 44 | bool: Whether the doc should be kept. 45 | str: If the document must be ignored, the reason. 46 | """ 47 | pass 48 | 49 | def batch_tag(self, batch: List[MultimodalSample]) -> List[Any]: 50 | """ 51 | Overwrite this method to implement batched filtering. Batches have size `self.batch_size`, except possibly the last one. 
52 | Args: 53 | batch: a list of MultimodalSample to process 54 | 55 | Returns: a list, the same size as `batch`, containing the tag for each sample 56 | 57 | """ 58 | return list(map(self.tag, tqdm(batch, desc=f"{self.name}"))) 59 | 60 | def process(self, sample: MultimodalSample, **kwargs) -> List[MultimodalSample]: 61 | tag = self.tag(sample) 62 | sample.metadata[self.metadata_key] = tag 63 | return [sample] 64 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/tagger/file_namer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from .base import BaseTagger, BaseTaggerConfig 4 | 5 | 6 | class FileNamer(BaseTagger): 7 | """ 8 | A tagger that extracts the file name of a sample. 9 | This tagger is useful for identifying the source file of a sample, especially when dealing with multiple files. 10 | It reads the `file_path` entry of the sample's metadata and stores the base name as a tag. 11 | Attributes: 12 | name (str): The name of the tagger. 13 | metadata_key (str): The metadata key under which the extracted file name is stored. 14 | """ 15 | 16 | def __init__(self, name: str = "🔤 File Namer", metadata_key: str = "file_name"): 17 | """ 18 | Initializes the FileNamer tagger. 19 | Args: 20 | name (str): The name of the tagger. 21 | metadata_key (str): The metadata key under which the extracted file name is stored. 22 | """ 23 | super().__init__(name, metadata_key) 24 | 25 | def tag(self, sample): 26 | if "file_path" not in sample.metadata: 27 | return "unknown" 28 | 29 | return os.path.basename(str(sample.metadata["file_path"])) 30 | 31 | @classmethod 32 | def from_config(cls, config: BaseTaggerConfig): 33 | # name and metadata_key keep their defaults; the config carries no extra arguments 34 | return cls() 35 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/tagger/hash.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from .base import BaseTagger 4 | 5 | 6 | class TextHash(BaseTagger): 7 | def __init__(self, name: str = "#️⃣ Auto ID", metadata_key: str = "hash"): 8 | super().__init__(name, metadata_key) 9 | 10 | def tag(self, sample): 11 | return TextHash.hash(sample.text.replace("<attachment>", ""))  # strip attachment placeholders so the hash depends only on the text 12 | 13 | @staticmethod 14 | def hash(text: str): 15 | return hashlib.md5(text.encode()).hexdigest() 16 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/tagger/lang_detector.py: -------------------------------------------------------------------------------- 1 | from langdetect import detect 2 | 3 | from .base import BaseTagger 4 | 5 | 6 | class LangDetector(BaseTagger): 7 | def __init__(self, name: str = "🗣️ Lang Detector", metadata_key: str = "lang"): 8 | super().__init__(name, metadata_key) 9 | 10 | def tag(self, sample): 11 | text = sample.text.replace("<attachment>", "")  # drop attachment placeholders before detecting the language 12 | 13 | try: 14 | lang = detect(text) 15 | except Exception: 16 | lang = "unknown" 17 | 18 | return lang 19 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/tagger/modalities.py: -------------------------------------------------------------------------------- 1 | from .base import BaseTagger 2 | 3 | 4 | class ModalitiesCounter(BaseTagger): 5 | def __init__( 6 | self, name: str = "📸 Modalities Counter", metadata_key="modalities_count" 7 | ): 8 | super().__init__(name, metadata_key) 9 | 10 | 
def tag(self, sample): 11 | return len(sample.modalities) 12 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/tagger/words.py: -------------------------------------------------------------------------------- 1 | from .base import BaseTagger 2 | 3 | 4 | class WordsCounter(BaseTagger): 5 | def __init__( 6 | self, name: str = "🔤 Words Counter", metadata_key: str = "word_count" 7 | ): 8 | super().__init__(name, metadata_key) 9 | 10 | def tag(self, sample): 11 | return len(sample.text.split()) 12 | -------------------------------------------------------------------------------- /src/mmore/process/post_processor/translator/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import TranslatorConfig, TranslatorPostProcessor 2 | 3 | __all__ = ["TranslatorPostProcessor", "TranslatorConfig"] 4 | -------------------------------------------------------------------------------- /src/mmore/process/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # Register all processors here 2 | 3 | import importlib 4 | import pkgutil 5 | 6 | from .base import Processor, ProcessorRegistry 7 | 8 | 9 | def register_all_processors(preload: bool = False): 10 | for _, module_name, _ in pkgutil.iter_modules([__path__[0]]): 11 | module = importlib.import_module(f"{__name__}.{module_name}") 12 | for attr in dir(module): 13 | cls = getattr(module, attr) 14 | if ( 15 | isinstance(cls, type) 16 | and issubclass(cls, Processor) 17 | and cls is not Processor 18 | ): 19 | ProcessorRegistry.register(cls, preload=preload) 20 | 21 | 22 | register_all_processors() 23 | -------------------------------------------------------------------------------- /src/mmore/process/processors/txt_processor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ...type import FileDescriptor, MultimodalSample 4 | from ..utils import clean_text 5 | from .base import Processor 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class TextProcessor(Processor): 11 | """ 12 | A processor for handling plain text files (.txt). Reads and cleans the text content. 13 | 14 | Attributes: 15 | files (List[FileDescriptor]): List of files to be processed. 16 | config (ProcessorConfig): Configuration for the processor. 17 | """ 18 | 19 | def __init__(self, config=None): 20 | """ 21 | Args: 22 | config (ProcessorConfig, optional): Configuration for the processor. 23 | Defaults to None. 24 | """ 25 | super().__init__(config=config) 26 | 27 | @classmethod 28 | def accepts(cls, file: FileDescriptor) -> bool: 29 | """ 30 | Args: 31 | file (FileDescriptor): The file descriptor to check. 32 | 33 | Returns: 34 | bool: True if the file is a plain text file (.txt), False otherwise. 35 | """ 36 | return file.file_extension.lower() in [".txt"] 37 | 38 | def process(self, file_path: str) -> MultimodalSample: 39 | """ 40 | Process a text file, clean its content, and return a multimodal sample with the cleaned text. 41 | 42 | Args: 43 | file_path (str): Path to the text file. 44 | 45 | Returns: 46 | MultimodalSample: A sample containing the cleaned text, an empty list of modalities, and file metadata. 
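Example (illustrative sketch; the path points at one of the repo's sample files):

    >>> processor = TextProcessor()
    >>> sample = processor.process("examples/sample_data/txt/test.txt")  # doctest: +SKIP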
47 | """ 48 | try: 49 | with open(file_path, "r", encoding="utf-8") as f: 50 | all_text = f.read() 51 | except (FileNotFoundError, PermissionError) as e: 52 | logger.error(f"Failed to read file {file_path}: {e}") 53 | return self.create_sample([], [], file_path) 54 | except UnicodeDecodeError as e: 55 | logger.error(f"Encoding error in file {file_path}: {e}") 56 | return self.create_sample([], [], file_path) 57 | 58 | all_text = clean_text(all_text) 59 | return self.create_sample([all_text], [], file_path) 60 | -------------------------------------------------------------------------------- /src/mmore/process/processors/url_processor.py: -------------------------------------------------------------------------------- 1 | import io 2 | import logging 3 | import re 4 | 5 | import requests 6 | import trafilatura 7 | from PIL import Image 8 | 9 | from ...type import MultimodalSample, URLDescriptor 10 | from ..utils import clean_text 11 | from .base import Processor, ProcessorConfig 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class URLProcessor(Processor): 17 | def __init__(self, config=None): 18 | """ 19 | Initialize the URLProcessor. 20 | 21 | :param config: ProcessorConfig object with configuration settings. 22 | """ 23 | super().__init__(config=config or ProcessorConfig()) 24 | self.ocr_models = None # Models will be loaded per process 25 | self.driver = None # WebDriver will be initialized per process 26 | 27 | @classmethod 28 | def accepts(cls, file) -> bool: 29 | return isinstance(file, URLDescriptor) 30 | 31 | def process_fast(self, file_path: str) -> MultimodalSample: 32 | try: # wrap in try because urls can be buggy 33 | downloaded = trafilatura.fetch_url(file_path) 34 | if not downloaded: 35 | raise ValueError(f"Failed to fetch content from URL: {file_path}") 36 | result = trafilatura.extract(downloaded, include_images=True) 37 | if not result: 38 | raise ValueError(f"Failed to extract content from URL: {file_path}") 39 | 40 | embedded_images = [] 41 | # replace all ![] with 42 | all_text = re.sub(r"!\[.*\]\(.*\)", self.config.attachment_tag, result) 43 | 44 | if self.config.custom_config.get("extract_images", True): 45 | images = re.findall(r"!\[.*\]\(.*\)", result) 46 | else: 47 | images = [] 48 | 49 | for image in images: 50 | try: 51 | image_url_search = re.search(r"\(.*\)", image) 52 | if image_url_search is None: 53 | raise Exception("Unable to find a valid URL of the image") 54 | 55 | image_url = image_url_search.group(0)[1:-1] 56 | headers = { 57 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 58 | } 59 | response = requests.get(image_url, headers=headers, timeout=5) 60 | response.raise_for_status() 61 | img = Image.open(io.BytesIO(response.content)).convert("RGB") 62 | embedded_images.append(img) 63 | 64 | except Exception as e: 65 | logger.error(f"Failed to process image {image}: {e}") 66 | 67 | all_text = [clean_text(all_text)] 68 | return self.create_sample(all_text, embedded_images, file_path) 69 | except Exception as e: 70 | logger.error(f"Failed to process URL {file_path}: {e}") 71 | return self.create_sample([], [], file_path) 72 | 73 | def process(self, file_path: str, fast: bool = False) -> MultimodalSample: 74 | return self.process_fast(file_path) 75 | -------------------------------------------------------------------------------- /src/mmore/process/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility 
functions for processing files, images, PDFs, and text. 3 | These functions can be used across various processors for data extraction, 4 | cleaning, splitting, and aggregation. 5 | """ 6 | 7 | import logging 8 | 9 | import numpy as np 10 | from cleantext import clean 11 | from PIL import Image 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def clean_text(text: str) -> str: 17 | """ 18 | Clean a given text using the `clean-text` library. https://pypi.org/project/clean-text/ 19 | 20 | Args: 21 | text (str): Input text to be cleaned. 22 | 23 | Returns: 24 | str: Cleaned text. 25 | """ 26 | return clean( 27 | text=text, 28 | fix_unicode=True, 29 | to_ascii=False, 30 | lower=False, 31 | no_line_breaks=False, 32 | no_urls=False, 33 | no_emails=True, 34 | no_phone_numbers=False, 35 | no_numbers=False, 36 | no_digits=False, 37 | no_currency_symbols=False, 38 | no_punct=False, 39 | replace_with_punct="", 40 | replace_with_url="This is a URL", 41 | replace_with_email="email@email.com", 42 | replace_with_phone_number="", 43 | replace_with_number="123", 44 | replace_with_digit="0", 45 | replace_with_currency_symbol="$", 46 | lang="en", 47 | ) 48 | 49 | 50 | def clean_image( 51 | image: Image.Image, min_width=512, min_height=512, variance_threshold=100 52 | ) -> bool: 53 | """ 54 | Validates an image based on its size and pixel variance (i.e., whether it is effectively one-colored). 55 | 56 | Args: 57 | image (PIL.Image.Image): The image to validate. 58 | min_width (int, optional): The minimum width an image must have to be considered valid. Defaults to 512. 59 | min_height (int, optional): The minimum height an image must have to be considered valid. Defaults to 512. 60 | variance_threshold (int, optional): The minimum variance in pixel intensity required. Images with lower variance are considered "empty". Defaults to 100. 61 | 62 | Returns: 63 | bool: True if the image meets all criteria, False otherwise. 
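Example (illustrative sketch):

    >>> from PIL import Image
    >>> blank = Image.new("RGB", (600, 600), color=(120, 120, 120))
    >>> clean_image(blank)  # large enough, but uniform, so the variance is ~0
    False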
64 | """ 65 | if image is None: 66 | return False 67 | 68 | w, h = image.size 69 | 70 | # Check size criteria 71 | if w < min_width or h < min_height: 72 | return False 73 | 74 | # Check variance threshold 75 | gray = image.convert("L") 76 | arr = np.array(gray) 77 | variance = arr.var() 78 | if variance < variance_threshold: 79 | return False 80 | 81 | return True 82 | -------------------------------------------------------------------------------- /src/mmore/rag/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiss-ai/mmore/a0b47c3df343652f191ac58a396e3904dfdcd4f0/src/mmore/rag/__init__.py -------------------------------------------------------------------------------- /src/mmore/rag/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .dense import DenseModel, DenseModelConfig 2 | from .sparse import SparseModel, SparseModelConfig 3 | 4 | __all__ = ["DenseModel", "SparseModel", "DenseModelConfig", "SparseModelConfig"] 5 | -------------------------------------------------------------------------------- /src/mmore/rag/model/dense/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import DenseModel, DenseModelConfig 2 | 3 | __all__ = ["DenseModel", "DenseModelConfig"] 4 | -------------------------------------------------------------------------------- /src/mmore/rag/model/dense/base.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from langchain_aws import BedrockEmbeddings 4 | from langchain_cohere import CohereEmbeddings 5 | from langchain_community.embeddings import FakeEmbeddings 6 | from langchain_core.embeddings import Embeddings 7 | from langchain_huggingface.embeddings import HuggingFaceEmbeddings 8 | from langchain_mistralai import MistralAIEmbeddings 9 | from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings 10 | from langchain_openai import OpenAIEmbeddings 11 | 12 | from .multimodal import MultimodalEmbeddings 13 | 14 | _OPENAI_MODELS = [ 15 | "text-embedding-3-small", 16 | "text-embedding-3-large", 17 | "text-embedding-ada-002", 18 | ] 19 | 20 | _GOOGLE_MODELS = ["textembedding-gecko@001"] 21 | 22 | _COHERE_MODELS = [ 23 | "embed-english-light-v2.0", 24 | "embed-english-v2.0", 25 | "embed-multilingual-v2.0", 26 | ] 27 | 28 | _MISTRAL_MODELS = ["mistral-textembedding-7B-v1", "mistral-textembedding-13B-v1"] 29 | 30 | _NVIDIA_MODELS = ["nvidia-clarity-text-embedding-v1", "nvidia-megatron-embedding-530B"] 31 | 32 | _AWS_MODELS = ["amazon-titan-embedding-xlarge", "amazon-titan-embedding-light"] 33 | 34 | 35 | loaders = { 36 | "OPENAI": OpenAIEmbeddings, 37 | # 'GOOGLE': VertexAIEmbeddings, 38 | "COHERE": CohereEmbeddings, 39 | "MISTRAL": MistralAIEmbeddings, 40 | "NVIDIA": NVIDIAEmbeddings, 41 | "AWS": BedrockEmbeddings, 42 | "HF": lambda model, **kwargs: HuggingFaceEmbeddings( 43 | model_name=model, model_kwargs={"trust_remote_code": True}, **kwargs 44 | ), 45 | "FAKE": lambda **kwargs: FakeEmbeddings( 46 | size=2048 47 | ), # For testing purposes, don't use in production 48 | } 49 | 50 | 51 | @dataclass 52 | class DenseModelConfig: 53 | model_name: str 54 | is_multimodal: bool = False 55 | 56 | @property 57 | def organization(self) -> str: 58 | if self.model_name in _OPENAI_MODELS: 59 | return "OPENAI" 60 | elif self.model_name in _GOOGLE_MODELS: 61 | return "GOOGLE" 62 | elif self.model_name in 
_COHERE_MODELS: 63 | return "COHERE" 64 | elif self.model_name in _MISTRAL_MODELS: 65 | return "MISTRAL" 66 | elif self.model_name in _NVIDIA_MODELS: 67 | return "NVIDIA" 68 | elif self.model_name in _AWS_MODELS: 69 | return "AWS" 70 | elif self.model_name == "debug": 71 | return "FAKE" # For testing purposes 72 | else: 73 | return "HF" 74 | 75 | 76 | class DenseModel(Embeddings): 77 | @classmethod 78 | def from_config(cls, config: DenseModelConfig) -> Embeddings: 79 | if config.organization == "HF" and config.is_multimodal: 80 | return MultimodalEmbeddings(model_name=config.model_name) 81 | else: 82 | return loaders[config.organization](model=config.model_name) 83 | -------------------------------------------------------------------------------- /src/mmore/rag/model/sparse/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import SparseModel, SparseModelConfig 2 | 3 | __all__ = ["SparseModel", "SparseModelConfig"] 4 | -------------------------------------------------------------------------------- /src/mmore/rag/model/sparse/base.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from langchain_milvus.utils.sparse import BaseSparseEmbedding 4 | 5 | from .splade import SpladeSparseEmbedding 6 | 7 | _SPLADE_MODELS = ["naver/splade-cocondenser-selfdistil"] 8 | _names = {"splade": "naver/splade-cocondenser-selfdistil"} 9 | loaders = {"SPLADE": SpladeSparseEmbedding} 10 | 11 | 12 | @dataclass 13 | class SparseModelConfig: 14 | model_name: str 15 | is_multimodal: bool = False 16 | 17 | def __post_init__(self): 18 | if self.model_name.lower() in _names: 19 | self.model_name = _names[self.model_name.lower()]  # resolve aliases such as "splade" case-insensitively 20 | 21 | @property 22 | def model_type(self) -> str: 23 | if self.model_name in _SPLADE_MODELS: 24 | return "SPLADE" 25 | else: 26 | raise NotImplementedError() 27 | 28 | 29 | class SparseModel(BaseSparseEmbedding): 30 | @classmethod 31 | def from_config(cls, config: SparseModelConfig) -> BaseSparseEmbedding: 32 | return loaders.get(config.model_type, SpladeSparseEmbedding)( 33 | model_name=config.model_name 34 | ) 35 | -------------------------------------------------------------------------------- /src/mmore/rag/model/sparse/splade.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import torch 4 | from langchain_milvus.utils.sparse import BaseSparseEmbedding 5 | 6 | 7 | class SpladeSparseEmbedding(BaseSparseEmbedding): 8 | """Sparse embedding model based on Splade. 9 | 10 | This class uses the Splade embedding function from the `pymilvus` model package to implement sparse vector embedding. 11 | It requires `pymilvus[model]` to be installed: 
12 | `pip install pymilvus[model]` 13 | For more information please refer to: 14 | https://milvus.io/docs/embed-with-splade.md 15 | """ 16 | 17 | def __init__(self, model_name: str = "naver/splade-cocondenser-selfdistil"): 18 | from pymilvus.model.sparse import SpladeEmbeddingFunction # type: ignore 19 | 20 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 21 | self.splade = SpladeEmbeddingFunction(model_name=model_name, device=self.device) 22 | 23 | def embed_query(self, query: str) -> Dict[int, float]: 24 | return self.splade.encode_queries([query]) 25 | 26 | def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]: 27 | return self.splade.encode_documents(texts) 28 | -------------------------------------------------------------------------------- /src/mmore/rag/pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example implementation: 3 | RAG pipeline. 4 | Integrates Milvus retrieval with HuggingFace text generation. 5 | """ 6 | 7 | from dataclasses import dataclass, field 8 | from typing import Any, Dict, List, Union 9 | 10 | from langchain_core.documents import Document 11 | from langchain_core.language_models.chat_models import BaseChatModel 12 | from langchain_core.output_parsers import StrOutputParser 13 | from langchain_core.prompts import ChatPromptTemplate 14 | from langchain_core.runnables import Runnable, RunnableLambda, RunnablePassthrough 15 | 16 | from ..utils import load_config 17 | from .llm import LLM, LLMConfig 18 | from .retriever import Retriever, RetrieverConfig 19 | from .types import MMOREInput, MMOREOutput 20 | 21 | DEFAULT_PROMPT = """\ 22 | Use the following context to answer the question. If the context does not answer the question, just say you don't know. 
23 | 24 | Context: 25 | {context} 26 | """ 27 | 28 | 29 | @dataclass 30 | class RAGConfig: 31 | """Configuration for RAG pipeline.""" 32 | 33 | retriever: RetrieverConfig 34 | llm: LLMConfig = field(default_factory=lambda: LLMConfig(llm_name="gpt2")) 35 | system_prompt: str = DEFAULT_PROMPT 36 | 37 | 38 | class RAGPipeline: 39 | """Main RAG pipeline combining retrieval and generation.""" 40 | 41 | retriever: Retriever 42 | llm: BaseChatModel 43 | prompt: Union[str, ChatPromptTemplate] 44 | 45 | def __init__( 46 | self, 47 | retriever: Retriever, 48 | prompt_template: Union[str, ChatPromptTemplate], 49 | llm: BaseChatModel, 50 | ): 51 | # Get modules 52 | self.retriever = retriever 53 | self.prompt = prompt_template 54 | self.llm = llm 55 | 56 | # Build the rag chain 57 | self.rag_chain = RAGPipeline._build_chain( 58 | self.retriever, RAGPipeline.format_docs, self.prompt, self.llm 59 | ) 60 | 61 | def __str__(self): 62 | return str(self.rag_chain) 63 | 64 | @classmethod 65 | def from_config(cls, config: str | RAGConfig): 66 | if isinstance(config, str): 67 | config = load_config(config, RAGConfig) 68 | 69 | retriever = Retriever.from_config(config.retriever) 70 | llm = LLM.from_config(config.llm) 71 | chat_template = ChatPromptTemplate.from_messages( 72 | [("system", config.system_prompt), ("human", "{input}")] 73 | ) 74 | 75 | return cls(retriever, chat_template, llm) 76 | 77 | @staticmethod 78 | def format_docs(docs: List[Document]) -> str: 79 | """Format documents for prompt.""" 80 | return "\n\n".join( 81 | f"[{doc.metadata['rank']}] {doc.page_content}" for doc in docs 82 | ) 83 | # return "\n\n".join(f"[#{doc.metadata['rank']}, sim={doc.metadata['similarity']:.2f}] {doc.page_content}" for doc in docs) 84 | 85 | @staticmethod 86 | def _build_chain(retriever, format_docs, prompt, llm) -> Runnable: 87 | validate_input = RunnableLambda( 88 | lambda x: MMOREInput.model_validate(x).model_dump() 89 | ) 90 | 91 | def make_output(x): 92 | """Validate the output of the LLM and keep only the actual answer of the assistant""" 93 | res_dict = MMOREOutput.model_validate(x).model_dump() 94 | res_dict["answer"] = res_dict["answer"].split("<|im_start|>assistant\n")[-1] 95 | 96 | return res_dict 97 | 98 | validate_output = RunnableLambda(make_output) 99 | 100 | rag_chain_from_docs = prompt | llm | StrOutputParser() 101 | 102 | core_chain = ( 103 | RunnablePassthrough.assign(docs=retriever) 104 | .assign(context=lambda x: format_docs(x["docs"])) 105 | .assign(answer=rag_chain_from_docs) 106 | ) 107 | 108 | return validate_input | core_chain | validate_output 109 | 110 | def __call__( 111 | self, queries: Dict[str, Any] | List[Dict[str, Any]], return_dict: bool = False 112 | ) -> List[Dict[str, str | List[str]]]: 113 | if isinstance(queries, Dict): 114 | queries_list = [queries] 115 | else: 116 | queries_list = queries 117 | 118 | results = self.rag_chain.batch(queries_list) 119 | 120 | if return_dict: 121 | return results 122 | else: 123 | return [result["answer"] for result in results] 124 | -------------------------------------------------------------------------------- /src/mmore/rag/types.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from langchain_core.documents import Document 4 | from pydantic import BaseModel, Field 5 | 6 | # ------------------------------- Simple Input ------------------------------- # 7 | 8 | 9 | class MMOREInput(BaseModel): 10 | """Input for the chat endpoint.""" 11 | 12 | input: str = Field( 
13 | ..., 14 | description="The user input.", 15 | ) 16 | collection_name: str = Field( 17 | ..., 18 | description="The name of the collection to retrieve from.", 19 | ) 20 | document_ids: List[str] = Field( 21 | default_factory=list, # Set default to an empty list if not provided 22 | description="List of document IDs, defaults to an empty list if not provided.", 23 | ) 24 | 25 | 26 | # ------------------------------- Simple Output ------------------------------ # 27 | 28 | 29 | class MMOREOutput(BaseModel): 30 | """Base answer: carries the query, the retrieved documents, and the generated answer.""" 31 | 32 | input: str 33 | docs: List[Document] 34 | answer: str 35 | 36 | 37 | # -------------------------------- CitedAnswer ------------------------------- # 38 | 39 | 40 | class CitedAnswer(BaseModel): 41 | """Answer the user question based only on the given sources, and cite the sources used.""" 42 | 43 | answer: str = Field( 44 | ..., 45 | description="The answer to the user question, which is based only on the given sources.", 46 | ) 47 | citations: List[int] = Field( 48 | ..., 49 | description="The integer IDs of the SPECIFIC sources which justify the answer.", 50 | ) 51 | 52 | 53 | # ------------------------------- QuotedAnswer ------------------------------- # 54 | 55 | 56 | class Citation(BaseModel): 57 | source_id: int = Field( 58 | ..., 59 | description="The integer ID of a SPECIFIC source which justifies the answer.", 60 | ) 61 | quote: str = Field( 62 | ..., 63 | description="The VERBATIM quote from the specified source that justifies the answer.", 64 | ) 65 | 66 | 67 | class QuotedAnswer(BaseModel): 68 | """Answer the user question based only on the given sources, and cite the sources used.""" 69 | 70 | answer: str = Field( 71 | ..., 72 | description="The answer to the user question, which is based only on the given sources.", 73 | ) 74 | citations: List[Citation] = Field( 75 | ..., description="Citations from the given sources that justify the answer." 76 | ) 77 | -------------------------------------------------------------------------------- /src/mmore/rag/vectorstore.py: -------------------------------------------------------------------------------- 1 | """ 2 | NOT USING THIS! Maybe a future update. 3 | ----- 4 | Simple vector database indexer using Milvus for document storage. 5 | Supports multimodal documents with chunking capabilities. 
6 | """ 7 | 8 | from dataclasses import dataclass 9 | from typing import Any, List 10 | 11 | from langchain_core.embeddings import Embeddings 12 | from langchain_core.vectorstores.base import VectorStoreRetriever 13 | from langchain_huggingface.embeddings import HuggingFaceEmbeddings 14 | from langchain_milvus import Milvus 15 | from langchain_milvus.utils.sparse import BaseSparseEmbedding 16 | 17 | from ..type import MultimodalSample 18 | from .model.dense.multimodal import MultimodalEmbeddings 19 | from .model.sparse.splade import SpladeSparseEmbedding 20 | 21 | 22 | @dataclass 23 | class VectorStoreConfig: 24 | dense_model_name: str = "sentence-transformers/all-MiniLM-L6-v2" 25 | sparse_model_name: str = "splade" 26 | collection_name: str = "rag" 27 | milvus_uri: str = ( 28 | "milvus_demo.db" # "http://localhost:19530" Milvus standalone docker service 29 | ) 30 | 31 | 32 | class VectorStoreMilvus: 33 | milvus: Milvus 34 | 35 | def __init__(self, milvus) -> None: 36 | self.milvus = milvus 37 | 38 | @classmethod 39 | def from_config(cls, config: VectorStoreConfig): 40 | # Get models 41 | dense_model = cls._init_dense_model(config.dense_model_name) 42 | # sparse_model = cls._init_sparse_model(config.sparse_model_name) 43 | 44 | # Instantiate the VectorStore 45 | milvus = Milvus( 46 | embedding_function=dense_model, 47 | # vector_field=['dense', 'sparse'], 48 | collection_name=config.collection_name, 49 | connection_args={"uri": config.milvus_uri}, 50 | auto_id=True, 51 | ) 52 | 53 | return cls(milvus=milvus) 54 | 55 | @classmethod 56 | def from_documents( 57 | cls, 58 | documents: List[MultimodalSample], 59 | config: VectorStoreConfig = VectorStoreConfig(), 60 | ): 61 | # Get models 62 | dense_model = VectorStoreMilvus._init_dense_model(config.dense_model_name) 63 | # sparse_model = VectorStoreMilvus._init_sparse_model(config.sparse_model_name) 64 | 65 | # Translate to multimodal embedder input 66 | texts = [MultimodalEmbeddings._multimodal_to_text(doc) for doc in documents] 67 | # metadatas = [doc.metadata for doc in documents] 68 | metadatas = [{"type": i} for i, doc in enumerate(documents)] 69 | 70 | milvus = Milvus.from_texts( 71 | texts, 72 | metadatas=metadatas, 73 | embedding=dense_model, 74 | # vector_field=['dense', 'sparse'], 75 | collection_name=config.collection_name, 76 | connection_args={"uri": config.milvus_uri}, 77 | ) 78 | 79 | return cls(milvus=milvus) 80 | 81 | def add_documents( 82 | self, documents: list[MultimodalSample], **kwargs: Any 83 | ) -> list[str]: 84 | docs = [MultimodalEmbeddings._multimodal_to_doc(sample) for sample in documents] 85 | return self.milvus.add_documents(docs, **kwargs) 86 | 87 | def as_retriever(self, **kwargs: Any) -> VectorStoreRetriever: 88 | return self.milvus.as_retriever(**kwargs) 89 | 90 | @staticmethod 91 | def _init_dense_model(dense_model_name: str) -> Embeddings: 92 | if dense_model_name == "meta-llama/Llama-3.2-11B-Vision": 93 | return MultimodalEmbeddings(model_name=dense_model_name) 94 | else: 95 | return HuggingFaceEmbeddings(model_name=dense_model_name) 96 | 97 | @staticmethod 98 | def _init_sparse_model(sparse_model_name: str) -> BaseSparseEmbedding: 99 | if sparse_model_name.lower() == "bm25": 100 | raise NotImplementedError() 101 | # return BM25SparseEmbedding(corpus) 102 | else: 103 | sparse_model_name = ( 104 | "naver/splade-cocondenser-selfdistil" 105 | if sparse_model_name == "splade" 106 | else sparse_model_name 107 | ) 108 | return SpladeSparseEmbedding(sparse_model_name) 109 | 
-------------------------------------------------------------------------------- /src/mmore/run_index.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from dataclasses import dataclass 4 | from typing import Optional, Union 5 | 6 | from dotenv import load_dotenv 7 | 8 | from mmore.index.indexer import Indexer, IndexerConfig 9 | from mmore.type import MultimodalSample 10 | from mmore.utils import load_config 11 | 12 | logger = logging.getLogger(__name__) 13 | INDEX_EMOJI = "🗂️" 14 | logging.basicConfig( 15 | format=f"[INDEX {INDEX_EMOJI} -- %(asctime)s] %(message)s", 16 | level=logging.INFO, 17 | datefmt="%Y-%m-%d %H:%M:%S", 18 | ) 19 | 20 | load_dotenv() 21 | 22 | 23 | @dataclass 24 | class IndexConfig: 25 | indexer: IndexerConfig 26 | collection_name: str 27 | documents_path: str 28 | 29 | 30 | def index( 31 | config_file: Union[IndexConfig, str], 32 | documents_path: Optional[str] = None, 33 | collection_name: Optional[str] = None, 34 | ): 35 | """Index the documents of a JSONL file into the configured vector store.""" 36 | # Load the config file 37 | config: IndexConfig = load_config(config_file, IndexConfig) 38 | if collection_name is None: 39 | collection_name = config.collection_name 40 | if documents_path is None: 41 | documents_path = config.documents_path 42 | 43 | documents = MultimodalSample.from_jsonl(documents_path) 44 | 45 | logger.info("Creating the indexer...") 46 | Indexer.from_documents( 47 | config=config.indexer, documents=documents, collection_name=collection_name 48 | ) 49 | logger.info("Documents indexed!") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument( 55 | "--config-file", required=True, help="Path to the index configuration file." 56 | ) 57 | parser.add_argument( 58 | "--documents-path", "-f", required=False, help="Path to the JSONL data." 59 | ) 60 | parser.add_argument( 61 | "--collection-name", 62 | "-n", 63 | required=False, 64 | help="Name of the collection to index.", 65 | ) 66 | args = parser.parse_args() 67 | 68 | # `index` loads the config itself; the optional CLI flags override the config values 69 | index( 70 | args.config_file, args.documents_path, args.collection_name 71 | ) 72 | -------------------------------------------------------------------------------- /src/mmore/run_live_retrieval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from fastapi import FastAPI 5 | 6 | from .run_index_api import make_router as index_router 7 | from .run_retriever import make_router as retriever_router 8 | 9 | 10 | def run(config_file: str, host: str, port: int): 11 | app = FastAPI(title="Live Indexing & Retrieval API") 12 | 13 | app.include_router(index_router(config_file)) 14 | app.include_router(retriever_router(config_file)) 15 | 16 | uvicorn.run(app, host=host, port=port) 17 | 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument( 22 | "--config-file", required=True, help="Path to the retriever configuration file." 23 | ) 24 | parser.add_argument( 25 | "--host", default="0.0.0.0", help="Host on which the API should be run." 26 | ) 27 | parser.add_argument( 28 | "--port", type=int, default=8000, help="Port on which the API should be run." 
29 | ) 30 | args = parser.parse_args() 31 | 32 | run(args.config_file, args.host, args.port) 33 | -------------------------------------------------------------------------------- /src/mmore/run_postprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from typing import List 4 | 5 | from mmore.process.post_processor.pipeline import PPPipeline, PPPipelineConfig 6 | from mmore.type import MultimodalSample 7 | from mmore.utils import load_config 8 | 9 | PP_EMOJI = "🧹" 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig( 12 | format=f"[PP {PP_EMOJI}-- %(asctime)s] %(message)s", 13 | level=logging.INFO, 14 | datefmt="%Y-%m-%d %H:%M:%S", 15 | ) 16 | 17 | 18 | def _load_dataset(data_path: List[str]) -> List[MultimodalSample]: 19 | return [s for path in data_path for s in MultimodalSample.from_jsonl(path)] 20 | 21 | 22 | def postprocess(config_file, input_data): 23 | """Run post-processors pipeline.""" 24 | if isinstance(input_data, str): 25 | input_data = [input_data] 26 | 27 | # Load config 28 | config = load_config(config_file, PPPipelineConfig) 29 | 30 | # Load post-processors pipeline 31 | pipeline = PPPipeline.from_config(config) 32 | 33 | # Load samples 34 | samples = _load_dataset(input_data) 35 | if len(samples) == 0: 36 | logger.warning("⚠️ Found no file to postprocess") 37 | 38 | # Run pipeline 39 | samples = pipeline(samples) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "--config_file", 46 | required=True, 47 | help="Path to the postprocess configuration file.", 48 | ) 49 | parser.add_argument( 50 | "--input_data", required=True, help="Path to the jsonl of the documents." 51 | ) 52 | 53 | args = parser.parse_args() 54 | postprocess(args.config_file, args.input_data) 55 | -------------------------------------------------------------------------------- /src/mmore/run_rag.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | from typing import Dict, List, Optional, Union, cast 7 | 8 | import uvicorn 9 | from dotenv import load_dotenv 10 | from fastapi import FastAPI 11 | from pydantic import BaseModel 12 | 13 | from mmore.rag.pipeline import RAGConfig, RAGPipeline 14 | from mmore.utils import load_config 15 | 16 | RAG_EMOJI = "🧠" 17 | logger = logging.getLogger(__name__) 18 | logging.basicConfig( 19 | format=f"[RAG {RAG_EMOJI} -- %(asctime)s] %(message)s", 20 | level=logging.INFO, 21 | datefmt="%Y-%m-%d %H:%M:%S", 22 | ) 23 | 24 | load_dotenv() 25 | 26 | 27 | @dataclass 28 | class LocalConfig: 29 | input_file: str 30 | output_file: str 31 | 32 | 33 | @dataclass 34 | class APIConfig: 35 | endpoint: str = "/rag" 36 | port: int = 8000 37 | host: str = "0.0.0.0" 38 | 39 | 40 | @dataclass 41 | class RAGInferenceConfig: 42 | rag: RAGConfig 43 | mode: str 44 | mode_args: Optional[Union[LocalConfig, APIConfig]] = None 45 | 46 | def __post_init__(self): 47 | if self.mode_args is None and self.mode == "api": 48 | self.mode_args = APIConfig() 49 | 50 | 51 | def read_queries(input_file: Union[Path, str]) -> List[Dict[str, str]]: 52 | with open(input_file, "r") as f: 53 | return [json.loads(line) for line in f] 54 | 55 | 56 | def save_results(results: List[Dict], output_file: Union[Path, str]): 57 | results = [ 58 | {key: d[key] for key in {"input", "context", "answer"} if key in d} 59 
| for d in results 60 | ] 61 | with open(output_file, "w") as f: 62 | json.dump(results, f, indent=2) 63 | 64 | 65 | class InnerInput(BaseModel): 66 | input: str 67 | collection_name: Optional[str] = None 68 | 69 | 70 | class RAGInput(BaseModel): 71 | input: InnerInput 72 | 73 | 74 | class RAGOutput(BaseModel): 75 | input: Optional[str] = None 76 | context: Optional[str] = None 77 | answer: Optional[str] = None 78 | 79 | 80 | def create_api(rag: RAGPipeline, endpoint: str): 81 | app = FastAPI( 82 | title="RAG Pipeline API", 83 | description="API for question answering using RAG", 84 | version="2.0", 85 | ) 86 | 87 | @app.post(endpoint, response_model=RAGOutput) 88 | async def run_rag(request: RAGInput): 89 | # Extract the inner input dict to pass to rag_chain 90 | pipeline_input = request.input.model_dump() 91 | output_dict = rag.rag_chain.invoke(pipeline_input) 92 | return RAGOutput(**output_dict) 93 | 94 | @app.get("/health") 95 | def health_check(): 96 | return {"status": "healthy"} 97 | 98 | return app 99 | 100 | 101 | def rag(config_file): 102 | """Run RAG in local or API""" 103 | config = load_config(config_file, RAGInferenceConfig) 104 | 105 | logger.info("Creating the RAG Pipeline...") 106 | rag_pp = RAGPipeline.from_config(config.rag) 107 | logger.info("RAG pipeline initialized!") 108 | 109 | if config.mode == "local": 110 | config_args = cast(LocalConfig, config.mode_args) 111 | 112 | queries = read_queries(config_args.input_file) 113 | results = rag_pp(queries, return_dict=True) 114 | save_results(results, config_args.output_file) 115 | 116 | elif config.mode == "api": 117 | config_args = cast(APIConfig, config.mode_args) 118 | 119 | app = create_api(rag_pp, config_args.endpoint) 120 | uvicorn.run(app, host=config_args.host, port=config_args.port) 121 | 122 | else: 123 | raise ValueError(f"Unknown mode: {config.mode}. Should be either api or local") 124 | 125 | 126 | if __name__ == "__main__": 127 | parser = argparse.ArgumentParser() 128 | parser.add_argument( 129 | "--config-file", required=True, help="Path to the rag configuration file." 130 | ) 131 | args = parser.parse_args() 132 | 133 | rag(args.config_file) 134 | -------------------------------------------------------------------------------- /src/mmore/websearchRAG/config.py: -------------------------------------------------------------------------------- 1 | # mmore/websearch/config.py 2 | from dataclasses import dataclass, field 3 | from pathlib import Path 4 | from typing import Any, Dict, Literal, Optional 5 | 6 | import yaml 7 | 8 | from ..rag.llm import LLMConfig # Reuse the same LLMConfig as RAG 9 | 10 | 11 | @dataclass 12 | class WebsearchConfig: 13 | """ 14 | Configuration for WebsearchPipeline. 15 | 16 | Fields: 17 | rag_config_path: (str or None) Path to the RAG config YAML. Required if use_rag=True. 18 | use_rag: (bool) If True, run RAG first; otherwise skip directly to sub-query generation. 19 | use_summary: (bool) If True, run an initial LLM-based summary of the RAG answer. 20 | input_file: (str) Path to the JSON file used as “queries” (or RAG output). 21 | output_file: (str) Path where the enhanced JSON results will be written. 22 | input_queries: (str) Path to queries file. 23 | n_subqueries: (int) Number of sub-queries to generate via LLM. 24 | n_loops: (int) Number of loops to run the process. 25 | max_searches: (int) Max results to fetch from DuckDuckGo per sub-query. 26 | llm_config: (dict) Passed to rag.llm.LLMConfig (keys: llm_name, max_new_tokens, temperature, etc.) 
27 | mode: (str) Mode of operation ("local" or "api"). 28 | """ 29 | 30 | rag_config_path: str # e.g., "../rag/config.yaml" 31 | output_file: str 32 | use_rag: bool = False 33 | use_summary: bool = False 34 | input_file: Optional[str] = None 35 | input_queries: Optional[str] = None 36 | n_subqueries: int = 3 37 | n_loops: int = 2 38 | max_searches: int = 10 39 | llm_config: LLMConfig = field( 40 | default_factory=lambda: LLMConfig( 41 | **{"llm_name": "gpt-4", "max_new_tokens": 1200} 42 | ) 43 | ) 44 | mode: Literal["local", "api"] = "local" 45 | 46 | def __post_init__(self): 47 | required_fields = ["n_loops", "n_subqueries", "max_searches", "mode"] 48 | for field_name in required_fields: 49 | if getattr(self, field_name) is None:  # only reject missing values; 0 stays a legal setting 50 | raise ValueError(f"'{field_name}' is a required field.") 51 | 52 | def get_llm_config(self) -> LLMConfig: 53 | """ 54 | Return the nested llm_config object. 55 | """ 56 | return self.llm_config 57 | 58 | def access_rag_config(self) -> Dict[str, Any]: 59 | """ 60 | Access and parse the RAG configuration file defined in `rag_config_path`. 61 | 62 | Returns: 63 | A dictionary representing the RAG configuration. 64 | """ 65 | if not self.rag_config_path: 66 | raise ValueError("The 'rag_config_path' is not defined.") 67 | 68 | # Resolve the full path to the RAG config file 69 | rag_config_full_path = Path(self.rag_config_path) 70 | 71 | if not rag_config_full_path.exists(): 72 | raise FileNotFoundError( 73 | f"RAG config file not found at {rag_config_full_path}" 74 | ) 75 | 76 | # Load the RAG configuration 77 | with open(rag_config_full_path, "r") as file: 78 | rag_config = yaml.safe_load(file) 79 | 80 | return rag_config 81 | -------------------------------------------------------------------------------- /src/mmore/websearchRAG/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | # Create or get the logger 4 | 5 | WEBSRCH_EMOJI = "🌐" 6 | logger = logging.getLogger("WEBSEARCHRAG") 7 | logging.basicConfig( 8 | format=f"[WebSearch {WEBSRCH_EMOJI} -- %(asctime)s] %(message)s", 9 | level=logging.INFO, 10 | datefmt="%Y-%m-%d %H:%M:%S", 11 | ) 12 | 13 | # Prevent multiple handlers if the logger is configured multiple times 14 | if not logger.handlers: 15 | # Create a file handler to log to a file 16 | file_handler = logging.FileHandler("shared_log_file.log") 17 | file_handler.setLevel(logging.DEBUG) 18 | 19 | # Define log format 20 | formatter = logging.Formatter( 21 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 22 | ) 23 | file_handler.setFormatter(formatter) 24 | 25 | # Add file handler to logger 26 | logger.addHandler(file_handler) 27 | -------------------------------------------------------------------------------- /src/mmore/websearchRAG/websearch.py: -------------------------------------------------------------------------------- 1 | from langchain_community.tools import DuckDuckGoSearchResults 2 | from langchain_community.utilities import DuckDuckGoSearchAPIWrapper 3 | 4 | from ..rag.llm import LLM, LLMConfig 5 | 6 | 7 | class WebsearchOnly: 8 | """Class dedicated to performing web searches and validating their usefulness.""" 9 | 10 | def __init__(self, region: str = "wt-wt", max_results: int = 10): 11 | """Initialize the WebsearchOnly class with search parameters.""" 12 | self.wrapper = DuckDuckGoSearchAPIWrapper( 13 | region=region, max_results=max_results 14 | ) 15 | 16 | def websearch_pipeline(self, query: str) -> str: 17 | 
"""Perform a single web search.""" 20 | search = DuckDuckGoSearchResults(api_wrapper=self.wrapper) 21 | web_output = search.run(query) 22 | return web_output 23 | 24 | def summarize_web_search(self, query: str, web_output: str) -> str: 25 | """Call LLM to summarize the current web output based on the original query, return a summary of the web search and the source.""" 26 | llm = LLM.from_config( 27 | LLMConfig(llm_name="OpenMeditron/meditron3-8b", max_new_tokens=1200) 28 | ) 29 | prompt = ( 30 | f"Original Query: '{query}'\n" 31 | f"Web content: '{web_output}'\n" 32 | "Based on the original query and the web content, can you provide a response to the original query?" 33 | ) 34 | response = llm.invoke(prompt).content 35 | assert isinstance(response, str) 36 | return response.strip() 37 | --------------------------------------------------------------------------------