├── .python-version
├── CLAUDE.md
├── scripts
│   ├── __init__.py
│   ├── hf_utils
│   │   ├── update_standalone.py
│   │   └── hf_model_process_check.py
│   ├── utils
│   │   └── sync_output_modeling.py
│   ├── context-relevance-datasets
│   │   ├── upload_context_relevance_to_hf.py
│   │   ├── frequency_filter_ds.py
│   │   └── add_reranker_teacher_scores.py
│   └── eval_mldr
│       └── ignored_questions.yaml
├── .env.sample
├── open_provence
│   ├── utils
│   │   ├── __init__.py
│   │   ├── modeling_export.py
│   │   └── model_architecture.py
│   ├── models
│   │   ├── __init__.py
│   │   └── open_provence_head.py
│   ├── trainer_cli.py
│   ├── modeling_open_provence_transformers.py
│   ├── __init__.py
│   ├── data_structures.py
│   └── losses.py
├── configs
│   ├── eval_datasets
│   │   ├── en.yaml
│   │   ├── en_nano.yaml
│   │   ├── ja.yaml
│   │   └── ja_nano.yaml
│   ├── open-provence-reranker-v1-gte-modernbert-base.yaml
│   ├── toy-open-provence-reranker-v1-gte-modernbert-base.yaml
│   ├── toy-open-provence-reranker-v1.yaml
│   ├── open-provence-reranker-large-v1.yaml
│   ├── open-provence-reranker-v1.yaml
│   └── open-provence-reranker-xsmall-v1.yaml
├── tox.ini
├── LICENSE
├── .github
│   └── workflows
│       └── ci.yaml
├── tests
│   ├── utils
│   │   ├── test_modeling_export.py
│   │   └── test_model_architecture.py
│   ├── test_modeling_default_dtype.py
│   ├── test_checkpoint_resolution.py
│   ├── scripts
│   │   ├── test_generate_ds_from_sentense_transformer.py
│   │   └── test_sync_output_modeling.py
│   ├── test_data_structures.py
│   ├── test_items_sampling.py
│   ├── test_sequential_fragmentize.py
│   ├── test_tokenizer_special_tokens.py
│   ├── test_eval_mldr_official.py
│   └── test_trainer_sampling.py
├── .gitignore
├── pyproject.toml
├── docs
│   ├── eval_dataset.md
│   ├── eval_mldr.md
│   └── train.md
└── AGENTS.md

/.python-version:
--------------------------------------------------------------------------------
1 | 3.11
2 | 
--------------------------------------------------------------------------------
/CLAUDE.md:
--------------------------------------------------------------------------------
1 | @AGENTS.md
2 | 
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | """Helper package for CLI utilities."""
2 | 
3 | from __future__ import annotations
4 | 
--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | # Environment variables for Open Provence tooling
2 | # Copy to .env and replace values with your own secrets.
3 | 
4 | # Required when using MLDR LLM-based evaluation or the Streamlit WebUI "LLM judge" features.
5 | OPENAI_API_KEY=sk-xxxxxxxxxx
6 | 
--------------------------------------------------------------------------------
/open_provence/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility modules for OpenProvence.
3 | """
4 | 
5 | from __future__ import annotations
6 | 
7 | from .model_architecture import ModelArchitectureUtils
8 | 
9 | __all__ = [
10 |     "ModelArchitectureUtils",
11 | ]
12 | 
--------------------------------------------------------------------------------
/open_provence/models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Models for OpenProvence. 
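This subpackage re-exports ``OpenProvenceHead`` and ``OpenProvenceHeadConfig`` from ``open_provence_head`` (see the import below).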
3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | from .open_provence_head import OpenProvenceHead, OpenProvenceHeadConfig 8 | 9 | __all__ = ["OpenProvenceHead", "OpenProvenceHeadConfig"] 10 | -------------------------------------------------------------------------------- /open_provence/trainer_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Entry point for training OpenProvence models. 4 | 5 | This script delegates to the main runner module. 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from open_provence.runner import main 11 | 12 | if __name__ == "__main__": 13 | main() 14 | -------------------------------------------------------------------------------- /configs/eval_datasets/en.yaml: -------------------------------------------------------------------------------- 1 | # Evaluation datasets for English freq reranker checkpoints. 2 | split: test 3 | datasets: 4 | - dataset_name: "hotchpotch/msmarco-context-relevance" 5 | subset: "freq2" 6 | - dataset_name: "hotchpotch/natural-questions-context-relevance" 7 | subset: "nodup_freq2" 8 | - dataset_name: "hotchpotch/gooaq-context-relevance-130k" 9 | subset: "default" 10 | -------------------------------------------------------------------------------- /configs/eval_datasets/en_nano.yaml: -------------------------------------------------------------------------------- 1 | split: test 2 | datasets: 3 | - dataset_name: "hotchpotch/msmarco-context-relevance" 4 | subset: "freq2" 5 | n_samples: 100 6 | - dataset_name: "hotchpotch/natural-questions-context-relevance" 7 | subset: "nodup_freq2" 8 | n_samples: 100 9 | - dataset_name: "hotchpotch/gooaq-context-relevance-130k" 10 | subset: "default" 11 | n_samples: 100 12 | -------------------------------------------------------------------------------- /open_provence/utils/modeling_export.py: -------------------------------------------------------------------------------- 1 | """Helpers for exporting modeling_open_provence_standalone scripts.""" 2 | 3 | from __future__ import annotations 4 | 5 | from pathlib import Path 6 | 7 | 8 | def write_modeling_open_provence( 9 | source: Path, 10 | destination: Path, 11 | ) -> None: 12 | """Copy modeling_open_provence_standalone.py without mutating its contents.""" 13 | 14 | destination.write_text(source.read_text(encoding="utf-8"), encoding="utf-8") 15 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | requires = tox-uv>=1.11.1 3 | envlist = pytests, lint, format-check, typecheck 4 | isolated_build = false 5 | skip_missing_interpreters = true 6 | 7 | [testenv] 8 | runner = uv-venv-lock-runner 9 | dependency_groups = 10 | dev 11 | cpu 12 | no_default_groups = true 13 | skip_install = true 14 | commands = python -c "raise SystemExit('Specify a concrete environment, e.g. 
`tox -e lint`')" # guard 15 | 16 | [testenv:pytests] 17 | commands = 18 | pytest --maxfail=1 --durations=5 -n auto --maxprocesses=4 --dist loadscope 19 | 20 | [testenv:lint] 21 | commands = 22 | ruff check open_provence tests scripts 23 | 24 | [testenv:format-check] 25 | commands = 26 | ruff format --check --diff open_provence tests scripts 27 | 28 | [testenv:typecheck] 29 | commands = 30 | pyright --threads 4 open_provence tests scripts 31 | -------------------------------------------------------------------------------- /open_provence/modeling_open_provence_transformers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility shim: legacy imports for OpenProvence Hugging Face helpers. 3 | 4 | All functionality now resides in ``modeling_open_provence_standalone``. This module keeps the 5 | old import path working for downstream tooling that still references 6 | ``open_provence.modeling_open_provence_transformers``. 7 | """ 8 | 9 | from __future__ import annotations 10 | 11 | from .modeling_open_provence_standalone import ( 12 | OpenProvenceConfig, 13 | OpenProvenceEncoderConfig, 14 | OpenProvenceEncoderForSequenceClassification, 15 | OpenProvenceEncoderForTokenClassification, 16 | OpenProvenceForSequenceClassification, 17 | OpenProvenceForTokenClassification, 18 | ) 19 | 20 | __all__ = [ 21 | "OpenProvenceConfig", 22 | "OpenProvenceForSequenceClassification", 23 | "OpenProvenceForTokenClassification", 24 | "OpenProvenceEncoderConfig", 25 | "OpenProvenceEncoderForSequenceClassification", 26 | "OpenProvenceEncoderForTokenClassification", 27 | ] 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Yuichi Tateno 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /open_provence/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Query-dependent text pruning and reranking for efficient RAG pipelines. 3 | 4 | This module provides functionality for pruning irrelevant content from documents 5 | based on queries, with optional reranking capabilities. 
6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from .data_collator import OpenProvenceDataCollator 11 | from .data_structures import ( 12 | OpenProvenceConfig, 13 | OpenProvenceOnlyOutput, 14 | OpenProvenceOutput, 15 | RerankingOpenProvenceOutput, 16 | ) 17 | from .encoder import OpenProvenceEncoder 18 | from .losses import OpenProvenceLoss 19 | from .trainer import OpenProvenceTrainer 20 | 21 | # Import runner module at the end to avoid circular imports 22 | # It will be imported after other modules are initialized 23 | 24 | __all__ = [ 25 | "OpenProvenceConfig", 26 | "RerankingOpenProvenceOutput", 27 | "OpenProvenceOutput", 28 | "OpenProvenceOnlyOutput", 29 | "OpenProvenceEncoder", 30 | "OpenProvenceTrainer", 31 | "OpenProvenceLoss", 32 | "OpenProvenceDataCollator", 33 | "runner", 34 | ] 35 | 36 | # Import runner after other modules are initialized 37 | from . import runner 38 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | ci: 15 | runs-on: ubuntu-latest 16 | env: 17 | UV_PYTHON: "3.11" 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v4 21 | 22 | - name: Set up uv 23 | id: setup-uv 24 | uses: astral-sh/setup-uv@v6 25 | with: 26 | enable-cache: true 27 | cache-suffix: linux-py311-tox 28 | 29 | - name: Install Python 3.11 30 | run: uv python install 3.11 31 | 32 | - name: Sync dependencies 33 | run: uv sync --locked --no-default-groups --group dev --group cpu 34 | 35 | - name: Download NLTK resources 36 | run: | 37 | # Use the virtualenv interpreter directly so CI never pulls NVIDIA/CUDA extras via `uv run`. 38 | ./.venv/bin/python -c "import nltk; nltk.download('punkt', quiet=True); nltk.download('punkt_tab', quiet=True)" 39 | 40 | - name: Run tox 41 | run: | 42 | # Invoke tox from the synced virtualenv to reuse locked deps; keep `run-parallel` for CI throughput. 
43 | ./.venv/bin/tox run-parallel 44 | -------------------------------------------------------------------------------- /tests/utils/test_modeling_export.py: -------------------------------------------------------------------------------- 1 | """Tests for ``open_provence.utils.modeling_export``.""" 2 | 3 | from __future__ import annotations 4 | 5 | from pathlib import Path 6 | 7 | from open_provence.utils.modeling_export import write_modeling_open_provence 8 | 9 | 10 | def _make_source(tmp_path: Path, content: str) -> Path: 11 | source = tmp_path / "modeling_open_provence_standalone.py" 12 | source.write_text(content, encoding="utf-8") 13 | return source 14 | 15 | 16 | def test_write_modeling_open_provence_copies_source(tmp_path: Path) -> None: 17 | content = "DEFAULT_SPLITTER_LANGUAGE = \"auto\"\n" 18 | source = _make_source(tmp_path, content) 19 | destination = tmp_path / "out.py" 20 | 21 | write_modeling_open_provence(source, destination) 22 | 23 | assert destination.read_text(encoding="utf-8") == content 24 | 25 | 26 | def test_write_modeling_open_provence_overwrites_existing(tmp_path: Path) -> None: 27 | content = "# latest\nDEFAULT_SPLITTER_LANGUAGE = \"auto\"\n" 28 | source = _make_source(tmp_path, content) 29 | destination = tmp_path / "out.py" 30 | destination.write_text("legacy\n", encoding="utf-8") 31 | 32 | write_modeling_open_provence(source, destination) 33 | 34 | assert destination.read_text(encoding="utf-8") == content 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Distribution / packaging 2 | .Python 3 | build/ 4 | develop-eggs/ 5 | dist/ 6 | downloads/ 7 | eggs/ 8 | .eggs/ 9 | lib/ 10 | lib64/ 11 | parts/ 12 | sdist/ 13 | var/ 14 | wheels/ 15 | share/python-wheels/ 16 | *.egg-info/ 17 | .installed.cfg 18 | *.egg 19 | MANIFEST 20 | 21 | # Docs 22 | /docs/_build/ 23 | /docs/make.bat 24 | 25 | # Editors 26 | .idea 27 | .vscode 28 | 29 | # Coverage 30 | htmlcov 31 | 32 | # Training outputs and temporary files 33 | output/ 34 | outputs/ 35 | tmp/ 36 | *.bin 37 | *.safetensors 38 | *.pt 39 | *.pth 40 | .coverage* 41 | coverage.xml 42 | 43 | # Examples 44 | /examples/**/output/* 45 | /examples/datasets/ 46 | /examples/embeddings/ 47 | /examples/sentence_transformer/training/quora_duplicate_questions/quora-IR-dataset/ 48 | examples/datasets/*/ 49 | 50 | 51 | # Specific files and folders 52 | /pretrained-models/ 53 | /cheatsheet.txt 54 | /testsuite.txt 55 | /TODO.txt 56 | 57 | # Virtual environments 58 | .env 59 | .venv 60 | env/ 61 | venv/ 62 | 63 | # Database 64 | /qdrant_storage 65 | /elastic-start-local 66 | 67 | # Others 68 | *.pyc 69 | *.gz 70 | *.tsv 71 | 72 | 73 | tmp_*.py 74 | nr_*/ 75 | wandb 76 | checkpoints 77 | tmp 78 | .DS_Store 79 | /runs 80 | /output/ 81 | /results/ 82 | /log/ 83 | /logs/ 84 | tmp/ 85 | tmp* 86 | log/ 87 | logs/ 88 | cache/ 89 | 90 | # Log directories 91 | logs/ 92 | scripts/log/ 93 | 94 | .cckiro/ 95 | -------------------------------------------------------------------------------- /configs/eval_datasets/ja.yaml: -------------------------------------------------------------------------------- 1 | # Evaluation datasets for Japanese freq reranker checkpoints. 
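# The list below mixes the English freq datasets (msmarco, natural-questions, gooaq) with the hotchpotch/japanese-context-relevance subsets.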
2 | split: test 3 | datasets: 4 | - dataset_name: "hotchpotch/msmarco-context-relevance" 5 | subset: "freq2" 6 | - dataset_name: "hotchpotch/natural-questions-context-relevance" 7 | subset: "nodup_freq2" 8 | - dataset_name: "hotchpotch/gooaq-context-relevance-130k" 9 | subset: "default" 10 | - dataset_name: "hotchpotch/japanese-context-relevance" 11 | subset: "msmarco-ja-freq2" 12 | - dataset_name: "hotchpotch/japanese-context-relevance" 13 | subset: "auto-wiki-qa-nemotron" 14 | - dataset_name: "hotchpotch/japanese-context-relevance" 15 | subset: "jaquad-freq2" 16 | - dataset_name: "hotchpotch/japanese-context-relevance" 17 | subset: "jqara" 18 | - dataset_name: "hotchpotch/japanese-context-relevance" 19 | subset: "jsquad-freq2" 20 | - dataset_name: "hotchpotch/japanese-context-relevance" 21 | subset: "miracl" 22 | - dataset_name: "hotchpotch/japanese-context-relevance" 23 | subset: "mkqa" 24 | - dataset_name: "hotchpotch/japanese-context-relevance" 25 | subset: "mr-tydi" 26 | - dataset_name: "hotchpotch/japanese-context-relevance" 27 | subset: "quiz-no-mori" 28 | - dataset_name: "hotchpotch/japanese-context-relevance" 29 | subset: "quiz-works" 30 | - dataset_name: "hotchpotch/japanese-context-relevance" 31 | subset: "JFWIR" 32 | -------------------------------------------------------------------------------- /tests/test_modeling_default_dtype.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import platform 4 | 5 | import pytest 6 | import torch 7 | 8 | try: 9 | from open_provence.modeling_open_provence_standalone import _select_default_torch_dtype 10 | except ImportError: # datasets などが未インストールの場合はスキップ 11 | pytest.skip( 12 | "modeling_open_provence_standalone requires optional dependencies", 13 | allow_module_level=True, 14 | ) 15 | 16 | 17 | def test_select_default_dtype_cuda_prefers_bf16(monkeypatch): 18 | monkeypatch.setattr(torch.cuda, "is_available", lambda: True) 19 | monkeypatch.setattr(torch.cuda, "is_bf16_supported", lambda: True) 20 | assert _select_default_torch_dtype("cuda") == torch.bfloat16 21 | 22 | 23 | def test_select_default_dtype_cuda_fallback_float16(monkeypatch): 24 | monkeypatch.setattr(torch.cuda, "is_available", lambda: True) 25 | monkeypatch.setattr(torch.cuda, "is_bf16_supported", lambda: False) 26 | assert _select_default_torch_dtype("cuda") == torch.float16 27 | 28 | 29 | def test_select_default_dtype_cpu_apple(monkeypatch): 30 | monkeypatch.setattr(platform, "system", lambda: "Darwin") 31 | monkeypatch.setattr(platform, "machine", lambda: "arm64") 32 | assert _select_default_torch_dtype("cpu") == "auto" 33 | 34 | 35 | def test_select_default_dtype_mps(monkeypatch): 36 | assert _select_default_torch_dtype("mps") == "auto" 37 | 38 | 39 | def test_select_default_dtype_unknown_device(monkeypatch): 40 | monkeypatch.setattr(platform, "system", lambda: "Linux") 41 | monkeypatch.setattr(platform, "machine", lambda: "x86_64") 42 | assert _select_default_torch_dtype("cpu") is None 43 | -------------------------------------------------------------------------------- /configs/eval_datasets/ja_nano.yaml: -------------------------------------------------------------------------------- 1 | # Nano evaluation slice aligned with freq datasets (first 100 examples per dataset). 
2 | split: test 3 | datasets: 4 | - dataset_name: "hotchpotch/msmarco-context-relevance" 5 | subset: "freq2" 6 | n_samples: 100 7 | - dataset_name: "hotchpotch/natural-questions-context-relevance" 8 | subset: "nodup_freq2" 9 | n_samples: 100 10 | - dataset_name: "hotchpotch/gooaq-context-relevance-130k" 11 | subset: "default" 12 | n_samples: 100 13 | - dataset_name: "hotchpotch/japanese-context-relevance" 14 | subset: "msmarco-ja-freq2" 15 | n_samples: 100 16 | - dataset_name: "hotchpotch/japanese-context-relevance" 17 | subset: "auto-wiki-qa-nemotron" 18 | n_samples: 100 19 | - dataset_name: "hotchpotch/japanese-context-relevance" 20 | subset: "jaquad-freq2" 21 | n_samples: 100 22 | - dataset_name: "hotchpotch/japanese-context-relevance" 23 | subset: "jqara" 24 | n_samples: 100 25 | - dataset_name: "hotchpotch/japanese-context-relevance" 26 | subset: "jsquad-freq2" 27 | n_samples: 100 28 | - dataset_name: "hotchpotch/japanese-context-relevance" 29 | subset: "miracl" 30 | n_samples: 100 31 | - dataset_name: "hotchpotch/japanese-context-relevance" 32 | subset: "mkqa" 33 | n_samples: 100 34 | - dataset_name: "hotchpotch/japanese-context-relevance" 35 | subset: "mr-tydi" 36 | n_samples: 100 37 | - dataset_name: "hotchpotch/japanese-context-relevance" 38 | subset: "quiz-no-mori" 39 | n_samples: 100 40 | - dataset_name: "hotchpotch/japanese-context-relevance" 41 | subset: "quiz-works" 42 | n_samples: 100 43 | - dataset_name: "hotchpotch/japanese-context-relevance" 44 | subset: "JFWIR" 45 | n_samples: 100 46 | -------------------------------------------------------------------------------- /configs/open-provence-reranker-v1-gte-modernbert-base.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | model_name_or_path: "Alibaba-NLP/gte-reranker-modernbert-base" 3 | classifier_dropout: 0.0 4 | 5 | data_args: 6 | datasets: 7 | - 8 | dataset_name: "hotchpotch/msmarco-context-relevance" 9 | subset: "freq2" 10 | teacher_column: "teacher_scores.gte-reranker-modernbert-base" 11 | - 12 | dataset_name: "hotchpotch/natural-questions-context-relevance" 13 | subset: "nodup_freq2" 14 | teacher_column: "teacher_scores.gte-reranker-modernbert-base" 15 | items: 6 16 | - 17 | dataset_name: "hotchpotch/gooaq-context-relevance-130k" 18 | subset: "default" 19 | teacher_column: "teacher_scores.gte-reranker-modernbert-base" 20 | items: 6 21 | 22 | 23 | training_args: 24 | overwrite_output_dir: true 25 | optimizer: "adafactor" 26 | 27 | # Training parameters 28 | learning_rate: 5.0e-5 29 | per_device_train_batch_size: 4 # If GPU memory is not enough, try reducing this value. 
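# Effective batch size = per_device_train_batch_size x gradient_accumulation_steps = 4 x 64 = 256 per device; if you lower the per-device value, consider raising gradient_accumulation_steps to keep the product unchanged.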
30 | gradient_accumulation_steps: 64 31 | max_grad_norm: 1.0 32 | 33 | # Optimizer and scheduler 34 | weight_decay: 0.01 35 | lr_scheduler_type: "cosine" 36 | warmup_ratio: 0.1 37 | 38 | # Logging and saving 39 | logging_steps: 100 40 | save_steps: 500 41 | save_total_limit: 5 42 | 43 | # Mixed precision 44 | fp16: false 45 | bf16: true 46 | 47 | # Other settings 48 | dataloader_num_workers: 8 49 | load_best_model_at_end: true 50 | num_train_epochs: 1 51 | 52 | # eval 53 | per_device_eval_batch_size: 16 54 | eval_steps: 500 55 | 56 | # Reporting 57 | report_to: ["wandb"] 58 | 59 | eval_datasets: 60 | config: configs/eval_datasets/en.yaml 61 | threshold: 0.1 62 | batch_size: 32 63 | -------------------------------------------------------------------------------- /configs/toy-open-provence-reranker-v1-gte-modernbert-base.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | model_name_or_path: "Alibaba-NLP/gte-reranker-modernbert-base" 3 | classifier_dropout: 0.0 4 | 5 | data_args: 6 | datasets: 7 | - 8 | dataset_name: "hotchpotch/msmarco-context-relevance" 9 | subset: "freq2" 10 | teacher_column: "teacher_scores.gte-reranker-modernbert-base" 11 | n_samples: 4000 12 | - 13 | dataset_name: "hotchpotch/natural-questions-context-relevance" 14 | subset: "nodup_freq2" 15 | teacher_column: "teacher_scores.gte-reranker-modernbert-base" 16 | items: 6 17 | n_samples: 4000 18 | - 19 | dataset_name: "hotchpotch/gooaq-context-relevance-130k" 20 | subset: "default" 21 | teacher_column: "teacher_scores.gte-reranker-modernbert-base" 22 | items: 6 23 | n_samples: 4000 24 | 25 | 26 | training_args: 27 | overwrite_output_dir: true 28 | optimizer: "adafactor" 29 | 30 | # Training parameters 31 | learning_rate: 5.0e-5 32 | per_device_train_batch_size: 4 # If GPU memory is not enough, try reducing this value. 
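# Toy run: the effective batch size here is only 4 x 16 = 64, smaller than the 256 used by the full-size configs.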
33 | gradient_accumulation_steps: 16 34 | max_grad_norm: 1.0 35 | 36 | # Optimizer and scheduler 37 | weight_decay: 0.01 38 | lr_scheduler_type: "cosine" 39 | warmup_ratio: 0.1 40 | 41 | # Logging and saving 42 | logging_steps: 100 43 | save_steps: 500 44 | save_total_limit: 5 45 | 46 | # Mixed precision 47 | fp16: false 48 | bf16: true 49 | 50 | # Other settings 51 | dataloader_num_workers: 8 52 | load_best_model_at_end: true 53 | num_train_epochs: 1 54 | 55 | # eval 56 | per_device_eval_batch_size: 16 57 | eval_steps: 500 58 | 59 | # Reporting 60 | report_to: ["wandb"] 61 | 62 | eval_datasets: 63 | config: configs/eval_datasets/en_nano.yaml 64 | threshold: 0.1 65 | batch_size: 32 66 | -------------------------------------------------------------------------------- /tests/test_checkpoint_resolution.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import pytest 6 | from open_provence.trainer import ResolvedCheckpoint, resolve_resume_checkpoint_path 7 | 8 | 9 | def _make_checkpoint(dir_path: Path) -> None: 10 | dir_path.mkdir(parents=True) 11 | (dir_path / "trainer_state.json").write_text("{}", encoding="utf-8") 12 | 13 | 14 | def test_resolve_explicit_checkpoint_returns_parent(tmp_path: Path) -> None: 15 | checkpoint_dir = tmp_path / "checkpoint-0500" 16 | _make_checkpoint(checkpoint_dir) 17 | 18 | resolved = resolve_resume_checkpoint_path(checkpoint_dir) 19 | 20 | assert isinstance(resolved, ResolvedCheckpoint) 21 | assert resolved.checkpoint_dir == checkpoint_dir.resolve() 22 | assert resolved.run_dir == tmp_path.resolve() 23 | assert resolved.steps == 500 24 | 25 | 26 | def test_resolve_parent_directory_picks_latest_checkpoint(tmp_path: Path) -> None: 27 | run_dir = tmp_path / "run" 28 | older = run_dir / "checkpoint-0100" 29 | newest = run_dir / "checkpoint-0500" 30 | _make_checkpoint(older) 31 | _make_checkpoint(newest) 32 | 33 | resolved = resolve_resume_checkpoint_path(run_dir) 34 | 35 | assert resolved.checkpoint_dir == newest.resolve() 36 | assert resolved.run_dir == run_dir.resolve() 37 | assert resolved.steps == 500 38 | 39 | 40 | def test_resolve_parent_directory_without_checkpoints_errors(tmp_path: Path) -> None: 41 | run_dir = tmp_path / "run" 42 | run_dir.mkdir() 43 | 44 | with pytest.raises(ValueError): 45 | resolve_resume_checkpoint_path(run_dir) 46 | 47 | 48 | def test_resolve_missing_path_errors(tmp_path: Path) -> None: 49 | missing = tmp_path / "missing" 50 | 51 | with pytest.raises(FileNotFoundError): 52 | resolve_resume_checkpoint_path(missing) 53 | -------------------------------------------------------------------------------- /configs/toy-open-provence-reranker-v1.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | model_name_or_path: "hotchpotch/japanese-reranker-base-v2" 3 | classifier_dropout: 0.0 4 | 5 | data_args: 6 | datasets: 7 | - 8 | dataset_name: "hotchpotch/msmarco-context-relevance" 9 | subset: "freq2" 10 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 11 | n_samples: 4000 12 | - 13 | dataset_name: "hotchpotch/japanese-context-relevance" 14 | subset: "msmarco-ja-freq2" 15 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 16 | n_samples: 4000 17 | - 18 | dataset_name: "hotchpotch/japanese-context-relevance" 19 | subset: "auto-wiki-qa-nemotron" 20 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 21 | n_samples: 4000 22 | 23 | training_args: 24 | 
overwrite_output_dir: true 25 | optimizer: "adafactor" 26 | 27 | # Training parameters 28 | learning_rate: 5.0e-5 29 | # The Japanese model produces stable and well-balanced scores with a batch size of 256. 30 | per_device_train_batch_size: 4 # If GPU memory is not enough, try reducing this value. 31 | gradient_accumulation_steps: 16 32 | max_grad_norm: 1.0 33 | 34 | # Optimizer and scheduler 35 | weight_decay: 0.01 36 | lr_scheduler_type: "cosine" 37 | warmup_ratio: 0.1 38 | 39 | # Logging and saving 40 | logging_steps: 100 41 | save_steps: 500 42 | save_total_limit: 5 43 | 44 | # Mixed precision 45 | fp16: false 46 | bf16: true 47 | 48 | # Other settings 49 | dataloader_num_workers: 8 50 | load_best_model_at_end: true 51 | num_train_epochs: 1 52 | 53 | # eval 54 | per_device_eval_batch_size: 16 55 | eval_steps: 500 56 | 57 | # Reporting 58 | report_to: ["wandb"] 59 | 60 | eval_datasets: 61 | config: configs/eval_datasets/ja_nano.yaml 62 | threshold: 0.1 63 | batch_size: 32 64 | -------------------------------------------------------------------------------- /tests/scripts/test_generate_ds_from_sentense_transformer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import subprocess 4 | import sys 5 | from pathlib import Path 6 | 7 | from datasets import Dataset, DatasetDict, load_from_disk 8 | 9 | SCRIPT_PATH = ( 10 | Path(__file__).resolve().parents[2] 11 | / "scripts" 12 | / "context-relevance-datasets" 13 | / "generate_ds_from_sentense_transformer.py" 14 | ) 15 | 16 | 17 | def build_source_dataset(root: Path) -> Path: 18 | rows = 20 19 | data = { 20 | "question": [f"question {i}" for i in range(rows)], 21 | "answer": [f"answer {i}" for i in range(rows)], 22 | "neg1": [f"neg1 {i}" for i in range(rows)], 23 | "neg2": [f"neg2 {i}" for i in range(rows)], 24 | } 25 | dataset = Dataset.from_dict(data) 26 | dataset_dict = DatasetDict({"train": dataset}) 27 | source_path = root / "source_ds" 28 | dataset_dict.save_to_disk(source_path) 29 | return source_path 30 | 31 | 32 | def test_generate_from_local_dataset(tmp_path): 33 | source_path = build_source_dataset(tmp_path) 34 | output_root = tmp_path / "converted" 35 | 36 | cmd = [ 37 | sys.executable, 38 | str(SCRIPT_PATH), 39 | "--dataset", 40 | str(source_path), 41 | "--lang", 42 | "en", 43 | "--output-root", 44 | str(output_root), 45 | "--overwrite", 46 | ] 47 | subprocess.run(cmd, check=True, cwd=Path(__file__).resolve().parents[2]) 48 | 49 | output_dirs = list(output_root.iterdir()) 50 | assert len(output_dirs) == 1 51 | converted = load_from_disk(output_dirs[0]) 52 | assert isinstance(converted, DatasetDict) 53 | assert set(converted.keys()) == {"train", "validation", "test"} 54 | first = converted["train"][0] 55 | assert first["query"].startswith("question") 56 | assert first["texts"][0].startswith("answer") 57 | assert first["labels"][0] == 1 58 | assert all(label in {0, 1} for label in first["labels"]) # sanity check 59 | -------------------------------------------------------------------------------- /tests/test_data_structures.py: -------------------------------------------------------------------------------- 1 | """Tests for ``open_provence.data_structures`` helpers.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | import torch 7 | from open_provence.data_structures import ( 8 | OpenProvenceOnlyOutput, 9 | OpenProvenceOutput, 10 | RerankingOpenProvenceOutput, 11 | ) 12 | 13 | 14 | def 
test_open_provence_output_to_dict_serializes_numpy() -> None: 15 | output = OpenProvenceOutput( 16 | ranking_scores=np.array([0.1, 0.2]), 17 | chunk_predictions=np.array([[1, 0], [0, 1]]), 18 | chunk_positions=[[(0, 1)]], 19 | compression_ratio=0.5, 20 | ) 21 | 22 | result = output.to_dict() 23 | 24 | assert result["ranking_scores"] == [0.1, 0.2] 25 | assert result["chunk_predictions"] == [[1, 0], [0, 1]] 26 | assert result["chunk_positions"] == [[(0, 1)]] 27 | assert result["compression_ratio"] == 0.5 28 | assert "token_scores" not in result 29 | 30 | 31 | def test_open_provence_only_output_to_dict_handles_torch() -> None: 32 | logits = torch.tensor([[[0.2, 0.8], [0.7, 0.3]]]) 33 | output = OpenProvenceOnlyOutput( 34 | pruning_logits=logits, 35 | pruning_masks=np.array([[1, 0]]), 36 | num_pruned_tokens=5, 37 | ) 38 | 39 | result = output.to_dict() 40 | 41 | np.testing.assert_allclose( 42 | result["pruning_logits"], 43 | [[[0.2, 0.8], [0.7, 0.3]]], 44 | ) 45 | assert result["pruning_masks"] == [[1, 0]] 46 | assert result["num_pruned_tokens"] == 5 47 | assert "pruning_probs" not in result 48 | 49 | 50 | def test_reranking_output_repr_includes_shapes() -> None: 51 | output = RerankingOpenProvenceOutput( 52 | ranking_scores=np.ones(2), 53 | pruning_masks=np.ones((1, 2)), 54 | compression_ratio=0.75, 55 | pruning_logits=torch.zeros(1, 2, 2), 56 | ) 57 | 58 | result = output.to_dict() 59 | assert result["pruning_logits"] == [[[0.0, 0.0], [0.0, 0.0]]] 60 | 61 | representation = repr(output) 62 | assert "ranking_scores=(2,)" in representation 63 | assert "pruning_masks=(1, 2)" in representation 64 | assert "compression_ratio=0.75" in representation 65 | -------------------------------------------------------------------------------- /tests/test_items_sampling.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from datasets import Dataset 4 | from open_provence.trainer import sample_items_by_label_priority 5 | 6 | 7 | def test_items_sampling_keeps_positive_and_samples_negatives(): 8 | dataset = Dataset.from_dict( 9 | { 10 | "labels": [[1, 0, 0, 0]], 11 | "texts": [["pos", "neg-a", "neg-b", "neg-c"]], 12 | "teacher_scores": [[0.9, 0.2, 0.1, 0.05]], 13 | } 14 | ) 15 | 16 | filtered = sample_items_by_label_priority(dataset, 3, seed=123, num_proc=1) 17 | 18 | assert len(filtered) == 1 19 | row = filtered[0] 20 | assert len(row["labels"]) == 3 21 | assert row["labels"][0] == 1 # positive entry is retained 22 | assert row["texts"][0] == "pos" 23 | # The remaining items originate from the original negatives 24 | assert set(row["texts"][1:]).issubset({"neg-a", "neg-b", "neg-c"}) 25 | assert len(row["teacher_scores"]) == 3 26 | 27 | 28 | def test_items_sampling_drops_queries_with_too_few_items(): 29 | dataset = Dataset.from_dict( 30 | { 31 | "id": ["short", "long"], 32 | "labels": [[1, 0], [1, 0, 0]], 33 | "texts": [["p", "n"], ["p", "n1", "n2"]], 34 | } 35 | ) 36 | 37 | filtered = sample_items_by_label_priority(dataset, 3, seed=42, num_proc=1) 38 | 39 | assert len(filtered) == 1 40 | assert filtered[0]["id"] == "long" 41 | assert len(filtered[0]["labels"]) == 3 42 | 43 | 44 | def test_items_sampling_handles_rows_without_positive_labels(): 45 | dataset = Dataset.from_dict( 46 | { 47 | "labels": [[0, 0, 0, 0]], 48 | "texts": [["a", "b", "c", "d"]], 49 | } 50 | ) 51 | 52 | filtered = sample_items_by_label_priority(dataset, 2, seed=7, num_proc=1) 53 | 54 | assert len(filtered) == 1 55 | row = filtered[0] 56 | assert 
len(row["labels"]) == 2 57 | assert set(row["texts"]).issubset({"a", "b", "c", "d"}) 58 | 59 | 60 | def test_items_sampling_prefers_positive_items_when_exceeding_limit(): 61 | dataset = Dataset.from_dict( 62 | { 63 | "labels": [[1, 1, 0, 0]], 64 | "texts": [["p1", "p2", "n1", "n2"]], 65 | } 66 | ) 67 | 68 | filtered = sample_items_by_label_priority(dataset, 2, seed=5, num_proc=1) 69 | 70 | assert len(filtered) == 1 71 | assert filtered[0]["labels"] == [1, 1] 72 | assert filtered[0]["texts"] == ["p1", "p2"] 73 | -------------------------------------------------------------------------------- /tests/test_sequential_fragmentize.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | from open_provence.modeling_open_provence_standalone import ( 6 | OpenProvenceModel, 7 | SentenceSplitter, 8 | ) 9 | 10 | 11 | class _StubTokenizer: 12 | """Minimal tokenizer stub that operates on Unicode codepoints.""" 13 | 14 | def encode(self, text: str, add_special_tokens: bool = False) -> list[int]: 15 | return [ord(ch) for ch in text] 16 | 17 | def __call__( 18 | self, 19 | sentences: list[str], 20 | *, 21 | add_special_tokens: bool = False, 22 | return_attention_mask: bool = False, 23 | ) -> dict[str, Any]: 24 | return {"input_ids": [[ord(ch) for ch in sentence] for sentence in sentences]} 25 | 26 | def batch_decode( 27 | self, 28 | sequences: list[list[int]], 29 | *, 30 | skip_special_tokens: bool = True, 31 | clean_up_tokenization_spaces: bool = False, 32 | ) -> list[str]: 33 | return ["".join(chr(ch) for ch in seq) for seq in sequences] 34 | 35 | def decode( 36 | self, 37 | sequence: list[int], 38 | *, 39 | skip_special_tokens: bool = True, 40 | clean_up_tokenization_spaces: bool = False, 41 | ) -> str: 42 | return "".join(chr(ch) for ch in sequence) 43 | 44 | 45 | def _split_sentences(text: str) -> list[str]: 46 | return [segment for segment in text.split("。") if segment] or [text] 47 | 48 | 49 | def test_run_sequential_fragmentize_produces_fragments() -> None: 50 | model = OpenProvenceModel.__new__(OpenProvenceModel) 51 | model.tokenizer = _StubTokenizer() 52 | 53 | job = { 54 | "query_idx": 0, 55 | "context_idx": 0, 56 | "context_text": "吾輩は猫である。名前はまだない。", 57 | "prefix_sentences": [], 58 | "manual_sentences": None, 59 | "cached_sentences": None, 60 | "cached_token_lists": None, 61 | } 62 | 63 | splitter: SentenceSplitter = _split_sentences 64 | 65 | results = model._run_sequential_fragmentize( 66 | [job], 67 | max_fragment_tokens=16, 68 | splitter=splitter, 69 | show_progress=False, 70 | strip_sentences=True, 71 | respect_sentence_boundaries=False, 72 | ) 73 | 74 | assert len(results) == 1 75 | entry = results[0] 76 | 77 | assert entry["sentences"] == ["吾輩は猫である", "名前はまだない"] 78 | assert entry["fragment_texts"] == ["吾輩は猫である", "名前はまだない"] 79 | assert entry["fragment_token_ids"] 80 | assert entry["timing_sentence_collect"] >= 0.0 81 | assert entry["timing_fragment_decode"] >= 0.0 82 | -------------------------------------------------------------------------------- /tests/test_tokenizer_special_tokens.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | from transformers import AutoTokenizer 5 | 6 | ENGLISH_MODEL_NAME = "Alibaba-NLP/gte-reranker-modernbert-base" 7 | JAPANESE_MODEL_NAME = "hotchpotch/japanese-reranker-base-v2" 8 | 9 | 10 | @pytest.mark.parametrize( 11 | ("model_name", "query", "document"), 
12 | [ 13 | ( 14 | ENGLISH_MODEL_NAME, 15 | "What is artificial intelligence?", 16 | "Artificial intelligence studies intelligent behaviour in machines.", 17 | ), 18 | ( 19 | JAPANESE_MODEL_NAME, 20 | "AIとは何ですか?", 21 | "AIは人工知能の略称で、人間の知能を機械で再現することを指します。", 22 | ), 23 | ], 24 | ) 25 | def test_encode_plus_inserts_special_tokens(model_name: str, query: str, document: str) -> None: 26 | """Ensure encode_plus inserts special tokens for both English and Japanese checkpoints.""" 27 | 28 | tokenizer = AutoTokenizer.from_pretrained(model_name) 29 | 30 | encoding = tokenizer.encode_plus( 31 | query, 32 | document, 33 | add_special_tokens=True, 34 | return_token_type_ids=True, 35 | ) 36 | 37 | input_ids = encoding["input_ids"] 38 | assert input_ids, "Tokenizer returned empty input ids." 39 | 40 | start_candidates = [ 41 | tokenizer.cls_token_id, 42 | tokenizer.bos_token_id, 43 | tokenizer.special_tokens_map.get("cls_token_id"), 44 | tokenizer.special_tokens_map.get("bos_token_id"), 45 | ] 46 | start_candidates = [tok_id for tok_id in start_candidates if isinstance(tok_id, int)] 47 | assert start_candidates, "Tokenizer has no CLS/BOS token id defined." 48 | assert input_ids[0] in start_candidates, ( 49 | f"Expected one of {start_candidates} at start, but got {input_ids[0]}." 50 | ) 51 | 52 | boundary_candidates = [ 53 | tokenizer.sep_token_id, 54 | tokenizer.eos_token_id, 55 | tokenizer.special_tokens_map.get("sep_token_id"), 56 | tokenizer.special_tokens_map.get("eos_token_id"), 57 | ] 58 | boundary_candidates = [tok_id for tok_id in boundary_candidates if isinstance(tok_id, int)] 59 | assert boundary_candidates, "Tokenizer has no SEP/EOS token id defined." 60 | 61 | boundary_indices = [ 62 | idx for idx, tok in enumerate(input_ids[1:], start=1) if tok in boundary_candidates 63 | ] 64 | assert boundary_indices, ( 65 | "No boundary token found between query and document " 66 | f"(candidates={boundary_candidates}, tokens={input_ids})." 67 | ) 68 | assert boundary_indices[0] < len(input_ids) - 1, ( 69 | "Boundary token should not be the final token." 70 | ) 71 | 72 | # Confirm that removing special tokens changes the sequence start. 73 | encoding_no_special = tokenizer.encode_plus( 74 | query, 75 | document, 76 | add_special_tokens=False, 77 | return_token_type_ids=True, 78 | ) 79 | assert encoding_no_special["input_ids"], "encode_plus without specials returned no tokens." 80 | assert encoding_no_special["input_ids"][0] not in start_candidates, ( 81 | "encode_plus(add_special_tokens=False) unexpectedly kept the start special token; " 82 | "this would invalidate the special-token check." 
83 | ) 84 | -------------------------------------------------------------------------------- /tests/utils/test_model_architecture.py: -------------------------------------------------------------------------------- 1 | """Tests for ``open_provence.utils.model_architecture``.""" 2 | 3 | from __future__ import annotations 4 | 5 | from open_provence.utils.model_architecture import ModelArchitectureUtils 6 | 7 | 8 | def test_detect_architecture_modernbert() -> None: 9 | keys = [ 10 | "tok_embeddings.weight", 11 | "layers.0.attn.Wqkv.weight", 12 | "layers.0.mlp_norm.weight", 13 | ] 14 | assert ModelArchitectureUtils.detect_architecture(keys) == "modernbert" 15 | 16 | 17 | def test_detect_architecture_prefers_known_prefixes() -> None: 18 | keys = [ 19 | "bert.embeddings.word_embeddings.weight", 20 | "bert.encoder.layer.0.attention.self.query.weight", 21 | "bert.pooler.dense.weight", 22 | ] 23 | assert ModelArchitectureUtils.detect_architecture(keys) == "bert" 24 | 25 | 26 | def test_detect_architecture_unknown_when_no_patterns() -> None: 27 | keys = ["linear.weight", "classifier.bias"] 28 | assert ModelArchitectureUtils.detect_architecture(keys) == "unknown" 29 | 30 | 31 | def test_needs_prefix_conversion_identifies_flat_modernbert_keys() -> None: 32 | keys = [ 33 | "embeddings.word_embeddings.weight", 34 | "layers.0.attn.Wqkv.weight", 35 | ] 36 | needs_conversion, prefix = ModelArchitectureUtils.needs_prefix_conversion(keys, "modernbert") 37 | assert needs_conversion is True 38 | assert prefix == "model." 39 | 40 | 41 | def test_needs_prefix_conversion_no_action_when_prefixed() -> None: 42 | keys = [ 43 | "model.embeddings.word_embeddings.weight", 44 | "model.layers.0.attn.Wqkv.weight", 45 | ] 46 | needs_conversion, prefix = ModelArchitectureUtils.needs_prefix_conversion(keys, "modernbert") 47 | assert needs_conversion is False 48 | assert prefix is None 49 | 50 | 51 | def test_convert_state_dict_keys_adds_and_skips() -> None: 52 | state_dict = { 53 | "embeddings.word_embeddings.weight": "weights", 54 | "layers.0.attn.Wqkv.weight": "attn", 55 | "pruning_head.linear.weight": "head", 56 | } 57 | 58 | converted = ModelArchitectureUtils.convert_state_dict_keys( 59 | state_dict, 60 | add_prefix="model.", 61 | skip_keys=["pruning_head"], 62 | ) 63 | 64 | assert converted["model.embeddings.word_embeddings.weight"] == "weights" 65 | assert converted["model.layers.0.attn.Wqkv.weight"] == "attn" 66 | assert converted["pruning_head.linear.weight"] == "head" 67 | 68 | 69 | def test_auto_fix_state_dict_adds_model_prefix_for_modernbert() -> None: 70 | state_dict = { 71 | "embeddings.word_embeddings.weight": "weights", 72 | "layers.0.attn.Wqkv.weight": "attn", 73 | } 74 | 75 | fixed = ModelArchitectureUtils.auto_fix_state_dict(state_dict, list(state_dict.keys()), "modernbert") 76 | 77 | assert "model.embeddings.word_embeddings.weight" in fixed 78 | assert "model.layers.0.attn.Wqkv.weight" in fixed 79 | 80 | 81 | def test_normalize_state_dict_for_saving_removes_model_prefix() -> None: 82 | state_dict = { 83 | "model.embeddings.word_embeddings.weight": "weights", 84 | "model.layers.0.attn.Wqkv.weight": "attn", 85 | } 86 | 87 | normalized = ModelArchitectureUtils.normalize_state_dict_for_saving(state_dict, "modernbert") 88 | 89 | assert "embeddings.word_embeddings.weight" in normalized 90 | assert "layers.0.attn.Wqkv.weight" in normalized 91 | -------------------------------------------------------------------------------- /configs/open-provence-reranker-large-v1.yaml: 
-------------------------------------------------------------------------------- 1 | model_args: 2 | model_name_or_path: "cl-nagoya/ruri-v3-reranker-310m" 3 | classifier_dropout: 0.0 4 | 5 | data_args: 6 | datasets: 7 | - 8 | dataset_name: "hotchpotch/msmarco-context-relevance" 9 | subset: "freq2" 10 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 11 | - 12 | items: 6 13 | dataset_name: "hotchpotch/natural-questions-context-relevance" 14 | subset: "nodup_freq2" 15 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 16 | - 17 | items: 6 18 | dataset_name: "hotchpotch/gooaq-context-relevance-130k" 19 | subset: "default" 20 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 21 | - 22 | dataset_name: "hotchpotch/japanese-context-relevance" 23 | subset: "msmarco-ja-freq2" 24 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 25 | - 26 | dataset_name: "hotchpotch/japanese-context-relevance" 27 | subset: "auto-wiki-qa-nemotron" 28 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 29 | - 30 | dataset_name: "hotchpotch/japanese-context-relevance" 31 | subset: "jaquad-freq2" 32 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 33 | - 34 | dataset_name: "hotchpotch/japanese-context-relevance" 35 | subset: "jqara" 36 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 37 | upsample_factor: 4.0 38 | - 39 | dataset_name: "hotchpotch/japanese-context-relevance" 40 | subset: "jsquad-freq2" 41 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 42 | - 43 | dataset_name: "hotchpotch/japanese-context-relevance" 44 | subset: "miracl" 45 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 46 | upsample_factor: 2.0 47 | - 48 | dataset_name: "hotchpotch/japanese-context-relevance" 49 | subset: "mkqa" 50 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 51 | upsample_factor: 2.0 52 | - 53 | dataset_name: "hotchpotch/japanese-context-relevance" 54 | subset: "mr-tydi" 55 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 56 | upsample_factor: 2.0 57 | - 58 | dataset_name: "hotchpotch/japanese-context-relevance" 59 | subset: "quiz-no-mori" 60 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 61 | - 62 | dataset_name: "hotchpotch/japanese-context-relevance" 63 | subset: "quiz-works" 64 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 65 | - 66 | dataset_name: "hotchpotch/japanese-context-relevance" 67 | subset: "JFWIR" 68 | teacher_column: "teacher_scores.ruri-v3-reranker-310m" 69 | 70 | 71 | training_args: 72 | overwrite_output_dir: true 73 | optimizer: "adafactor" 74 | 75 | # Training parameters 76 | learning_rate: 5.0e-5 77 | # The Japanese model produces stable and well-balanced scores with a batch size of 256. 78 | per_device_train_batch_size: 2 # If GPU memory is not enough, try reducing this value. 
79 | gradient_accumulation_steps: 128 80 | max_grad_norm: 1.0 81 | 82 | # Optimizer and scheduler 83 | weight_decay: 0.01 84 | lr_scheduler_type: "cosine" 85 | warmup_ratio: 0.1 86 | 87 | # Logging and saving 88 | logging_steps: 100 89 | save_steps: 500 90 | save_total_limit: 5 91 | 92 | # Mixed precision 93 | fp16: false 94 | bf16: true 95 | 96 | # Other settings 97 | dataloader_num_workers: 8 98 | load_best_model_at_end: true 99 | num_train_epochs: 1 100 | 101 | # eval 102 | per_device_eval_batch_size: 16 103 | eval_steps: 500 104 | 105 | # Reporting 106 | report_to: ["wandb"] 107 | 108 | eval_datasets: 109 | config: configs/eval_datasets/ja.yaml 110 | threshold: 0.1 111 | batch_size: 16 112 | -------------------------------------------------------------------------------- /configs/open-provence-reranker-v1.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | model_name_or_path: "hotchpotch/japanese-reranker-base-v2" 3 | classifier_dropout: 0.0 4 | 5 | data_args: 6 | datasets: 7 | - 8 | dataset_name: "hotchpotch/msmarco-context-relevance" 9 | subset: "freq2" 10 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 11 | - 12 | items: 6 13 | dataset_name: "hotchpotch/natural-questions-context-relevance" 14 | subset: "nodup_freq2" 15 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 16 | - 17 | items: 6 18 | dataset_name: "hotchpotch/gooaq-context-relevance-130k" 19 | subset: "default" 20 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 21 | - 22 | dataset_name: "hotchpotch/japanese-context-relevance" 23 | subset: "msmarco-ja-freq2" 24 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 25 | - 26 | dataset_name: "hotchpotch/japanese-context-relevance" 27 | subset: "auto-wiki-qa-nemotron" 28 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 29 | - 30 | dataset_name: "hotchpotch/japanese-context-relevance" 31 | subset: "jaquad-freq2" 32 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 33 | - 34 | dataset_name: "hotchpotch/japanese-context-relevance" 35 | subset: "jqara" 36 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 37 | upsample_factor: 4.0 38 | - 39 | dataset_name: "hotchpotch/japanese-context-relevance" 40 | subset: "jsquad-freq2" 41 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 42 | - 43 | dataset_name: "hotchpotch/japanese-context-relevance" 44 | subset: "miracl" 45 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 46 | upsample_factor: 2.0 47 | - 48 | dataset_name: "hotchpotch/japanese-context-relevance" 49 | subset: "mkqa" 50 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 51 | upsample_factor: 2.0 52 | - 53 | dataset_name: "hotchpotch/japanese-context-relevance" 54 | subset: "mr-tydi" 55 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 56 | upsample_factor: 2.0 57 | - 58 | dataset_name: "hotchpotch/japanese-context-relevance" 59 | subset: "quiz-no-mori" 60 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 61 | - 62 | dataset_name: "hotchpotch/japanese-context-relevance" 63 | subset: "quiz-works" 64 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 65 | - 66 | dataset_name: "hotchpotch/japanese-context-relevance" 67 | subset: "JFWIR" 68 | teacher_column: "teacher_scores.japanese-reranker-base-v2" 69 | 70 | 71 | training_args: 72 | overwrite_output_dir: true 73 | optimizer: "adafactor" 74 | 75 | # Training parameters 76 | learning_rate: 5.0e-5 77 | # The Japanese model produces stable 
and well-balanced scores with a batch size of 256. 78 | per_device_train_batch_size: 4 # If GPU memory is not enough, try reducing this value. 79 | gradient_accumulation_steps: 64 80 | max_grad_norm: 1.0 81 | 82 | # Optimizer and scheduler 83 | weight_decay: 0.01 84 | lr_scheduler_type: "cosine" 85 | warmup_ratio: 0.1 86 | 87 | # Logging and saving 88 | logging_steps: 100 89 | save_steps: 500 90 | save_total_limit: 5 91 | 92 | # Mixed precision 93 | fp16: false 94 | bf16: true 95 | 96 | # Other settings 97 | dataloader_num_workers: 8 98 | load_best_model_at_end: true 99 | num_train_epochs: 1 100 | 101 | # eval 102 | per_device_eval_batch_size: 16 103 | eval_steps: 500 104 | 105 | # Reporting 106 | report_to: ["wandb"] 107 | 108 | eval_datasets: 109 | config: configs/eval_datasets/ja.yaml 110 | threshold: 0.1 111 | batch_size: 32 112 | -------------------------------------------------------------------------------- /configs/open-provence-reranker-xsmall-v1.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | model_name_or_path: "hotchpotch/japanese-reranker-xsmall-v2" 3 | classifier_dropout: 0.0 4 | 5 | data_args: 6 | datasets: 7 | - 8 | dataset_name: "hotchpotch/msmarco-context-relevance" 9 | subset: "freq2" 10 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 11 | - 12 | items: 6 13 | dataset_name: "hotchpotch/natural-questions-context-relevance" 14 | subset: "nodup_freq2" 15 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 16 | - 17 | items: 6 18 | dataset_name: "hotchpotch/gooaq-context-relevance-130k" 19 | subset: "default" 20 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 21 | - 22 | dataset_name: "hotchpotch/japanese-context-relevance" 23 | subset: "msmarco-ja-freq2" 24 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 25 | - 26 | dataset_name: "hotchpotch/japanese-context-relevance" 27 | subset: "auto-wiki-qa-nemotron" 28 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 29 | - 30 | dataset_name: "hotchpotch/japanese-context-relevance" 31 | subset: "jaquad-freq2" 32 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 33 | - 34 | dataset_name: "hotchpotch/japanese-context-relevance" 35 | subset: "jqara" 36 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 37 | upsample_factor: 4.0 38 | - 39 | dataset_name: "hotchpotch/japanese-context-relevance" 40 | subset: "jsquad-freq2" 41 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 42 | - 43 | dataset_name: "hotchpotch/japanese-context-relevance" 44 | subset: "miracl" 45 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 46 | upsample_factor: 2.0 47 | - 48 | dataset_name: "hotchpotch/japanese-context-relevance" 49 | subset: "mkqa" 50 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 51 | upsample_factor: 2.0 52 | - 53 | dataset_name: "hotchpotch/japanese-context-relevance" 54 | subset: "mr-tydi" 55 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 56 | upsample_factor: 2.0 57 | - 58 | dataset_name: "hotchpotch/japanese-context-relevance" 59 | subset: "quiz-no-mori" 60 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 61 | - 62 | dataset_name: "hotchpotch/japanese-context-relevance" 63 | subset: "quiz-works" 64 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 65 | - 66 | dataset_name: "hotchpotch/japanese-context-relevance" 67 | subset: "JFWIR" 68 | teacher_column: "teacher_scores.japanese-reranker-xsmall-v2" 69 | 70 | 
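# Note on the dataset entries above: teacher_column names the column holding the precomputed scores from that teacher reranker, and upsample_factor appears to weight a dataset more heavily (roughly by that factor) when the training mix is built (here jqara 4x; miracl, mkqa and mr-tydi 2x).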
71 | training_args: 72 | overwrite_output_dir: true 73 | optimizer: "adafactor" 74 | 75 | # Training parameters 76 | learning_rate: 5.0e-5 77 | # The Japanese model produces stable and well-balanced scores with a batch size of 256. 78 | per_device_train_batch_size: 4 # If GPU memory is not enough, try reducing this value. 79 | gradient_accumulation_steps: 64 80 | max_grad_norm: 1.0 81 | 82 | # Optimizer and scheduler 83 | weight_decay: 0.01 84 | lr_scheduler_type: "cosine" 85 | warmup_ratio: 0.1 86 | 87 | # Logging and saving 88 | logging_steps: 100 89 | save_steps: 500 90 | save_total_limit: 5 91 | 92 | # Mixed precision 93 | fp16: false 94 | bf16: true 95 | 96 | # Other settings 97 | dataloader_num_workers: 8 98 | load_best_model_at_end: true 99 | num_train_epochs: 1 100 | 101 | # eval 102 | per_device_eval_batch_size: 16 103 | eval_steps: 500 104 | 105 | # Reporting 106 | report_to: ["wandb"] 107 | 108 | eval_datasets: 109 | config: configs/eval_datasets/ja.yaml 110 | threshold: 0.1 111 | batch_size: 64 112 | -------------------------------------------------------------------------------- /tests/scripts/test_sync_output_modeling.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib.util 4 | import io 5 | import json 6 | import sys 7 | from pathlib import Path 8 | 9 | 10 | def _repo_root() -> Path: 11 | return Path(__file__).resolve().parents[2] 12 | 13 | 14 | def _load_sync_module(): 15 | module_path = _repo_root() / "scripts" / "utils" / "sync_output_modeling.py" 16 | spec = importlib.util.spec_from_file_location("sync_output_modeling", module_path) 17 | if spec is None or spec.loader is None: 18 | raise RuntimeError("Failed to load sync_output_modeling module") 19 | module = importlib.util.module_from_spec(spec) 20 | sys.modules[spec.name] = module 21 | spec.loader.exec_module(module) # type: ignore[assignment] 22 | return module 23 | 24 | 25 | def test_sync_updates_modeling_and_config(tmp_path: Path) -> None: 26 | repo_root = _repo_root() 27 | base_file = repo_root / "open_provence" / "modeling_open_provence_standalone.py" 28 | sync = _load_sync_module() 29 | 30 | output_dir = tmp_path / "output" 31 | run_dir = output_dir / "toy-open-provence-reranker-japanese-test" 32 | run_dir.mkdir(parents=True, exist_ok=True) 33 | 34 | # Create outdated modeling file 35 | (run_dir / "modeling_open_provence_standalone.py").write_text( 36 | "# legacy content\n", 37 | encoding="utf-8", 38 | ) 39 | 40 | # Config with wrong language and missing legacy field 41 | config_path = run_dir / "config.json" 42 | config_path.write_text( 43 | json.dumps( 44 | { 45 | "model_type": "open_provence", 46 | "splitter_default_language": "en", 47 | "standalone_process_default_language": "en", 48 | "modeling_open_provence_default_language": "en", 49 | }, 50 | indent=2, 51 | ensure_ascii=False, 52 | ) 53 | + "\n", 54 | encoding="utf-8", 55 | ) 56 | 57 | states = sync.plan_sync(base_file, output_dir) 58 | assert len(states) == 1 59 | state = states[0] 60 | assert state.modeling_needs_update is True 61 | assert state.config_needs_update is True 62 | assert set(state.removed_keys) == { 63 | "splitter_default_language", 64 | "standalone_process_default_language", 65 | "modeling_open_provence_default_language", 66 | } 67 | 68 | stream = io.StringIO() 69 | sync.sync_targets(base_file, output_dir, overwrite=True, stream=stream) 70 | 71 | # modeling file should now match base file 72 | assert (run_dir / 
"modeling_open_provence_standalone.py").read_text( 73 | encoding="utf-8" 74 | ) == base_file.read_text(encoding="utf-8") 75 | 76 | updated_config = json.loads(config_path.read_text(encoding="utf-8")) 77 | for key in ( 78 | "splitter_default_language", 79 | "standalone_process_default_language", 80 | "modeling_open_provence_default_language", 81 | ): 82 | assert key not in updated_config 83 | 84 | output = stream.getvalue() 85 | assert "copied modeling_open_provence_standalone.py" in output 86 | assert "removed deprecated config keys" in output 87 | 88 | 89 | def test_sync_skip_when_up_to_date(tmp_path: Path) -> None: 90 | repo_root = _repo_root() 91 | base_file = repo_root / "open_provence" / "modeling_open_provence_standalone.py" 92 | sync = _load_sync_module() 93 | 94 | output_dir = tmp_path / "output" 95 | run_dir = output_dir / "toy-open-provence-reranker-test" 96 | run_dir.mkdir(parents=True, exist_ok=True) 97 | 98 | # Up-to-date modeling file 99 | run_dir.joinpath("modeling_open_provence_standalone.py").write_text( 100 | base_file.read_text(encoding="utf-8"), 101 | encoding="utf-8", 102 | ) 103 | 104 | config_path = run_dir / "config.json" 105 | config_path.write_text( 106 | json.dumps( 107 | { 108 | "model_type": "open_provence", 109 | "some_other_field": "value", 110 | }, 111 | indent=2, 112 | ensure_ascii=False, 113 | ) 114 | + "\n", 115 | encoding="utf-8", 116 | ) 117 | 118 | stream = io.StringIO() 119 | sync.sync_targets(base_file, output_dir, overwrite=False, stream=stream) 120 | 121 | assert "SKIP (already up to date)" in stream.getvalue() 122 | -------------------------------------------------------------------------------- /tests/test_eval_mldr_official.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from datasets import Dataset 8 | 9 | ROOT = Path(__file__).resolve().parents[1] 10 | if str(ROOT) not in sys.path: 11 | sys.path.append(str(ROOT)) 12 | 13 | from scripts.eval_mldr import ( # noqa: E402 14 | _should_use_naver_provence_model, 15 | build_records, 16 | parse_args, 17 | ) 18 | 19 | 20 | def _build_dummy_dataset() -> Dataset: 21 | return Dataset.from_list( 22 | [ 23 | { 24 | "query_id": "q1", 25 | "query": "dummy question", 26 | "positive_passages": [ 27 | {"text": "positive text", "docid": "doc1", "title": "Title 1"}, 28 | {"text": "another positive", "docid": "doc2", "title": None}, 29 | ], 30 | "negative_passages": [], 31 | } 32 | ] 33 | ) 34 | 35 | 36 | def test_official_detector_handles_remote_ids() -> None: 37 | assert _should_use_naver_provence_model( 38 | "naver/provence-reranker-debertav3-v1", 39 | is_local=False, 40 | ) 41 | assert _should_use_naver_provence_model( 42 | "NAVER/Provence-multilingual", 43 | is_local=False, 44 | ) 45 | assert _should_use_naver_provence_model( 46 | "naver/xprovence-reranker-bgem3-v1", 47 | is_local=False, 48 | ) 49 | assert not _should_use_naver_provence_model( 50 | "naver/other-model", 51 | is_local=False, 52 | ) 53 | assert not _should_use_naver_provence_model( 54 | "./local/provence", 55 | is_local=True, 56 | ) 57 | 58 | 59 | def test_parse_args_auto_adjusts_for_official(monkeypatch, tmp_path: Path) -> None: 60 | monkeypatch.setattr("scripts.eval_mldr.torch.cuda.is_available", lambda: True) 61 | argv = [ 62 | "prog", 63 | "--model", 64 | "naver/xprovence-reranker-bgem3-v1", 65 | "--lang", 66 | "en", 67 | "--output-dir", 68 | str(tmp_path / "out"), 69 | "--no-eval", 70 | 
] 71 | monkeypatch.setattr(sys, "argv", argv) 72 | 73 | args = parse_args() 74 | 75 | assert args.device == "cuda" 76 | assert args.torch_dtype == "bfloat16" 77 | assert args.auto_device_cuda 78 | assert args.auto_torch_dtype 79 | 80 | 81 | def test_build_records_fills_missing_fields_for_official_results() -> None: 82 | dataset = _build_dummy_dataset() 83 | 84 | def process_fn(**_: Any) -> dict[str, Any]: 85 | return { 86 | "pruned_context": [["positive text", "another positive"]], 87 | "reranking_score": [[0.8, 0.6]], 88 | "compression_rate": [[20.0, 30.0]], 89 | } 90 | 91 | records, stats, num_queries = build_records( 92 | process_fn, 93 | dataset, 94 | threshold=0.1, 95 | batch_size=2, 96 | log_timing=False, 97 | use_best_reranker_score=True, 98 | show_progress=False, 99 | ) 100 | 101 | assert num_queries == 1 102 | assert len(records) == 2 103 | for record in records: 104 | assert record["kept_sentences"] == [] 105 | assert record["removed_sentences"] == [] 106 | assert stats["pos_scores"] == [0.8, 0.6] 107 | 108 | 109 | def test_build_records_accepts_scalar_outputs() -> None: 110 | dataset = Dataset.from_list( 111 | [ 112 | { 113 | "query_id": "q1", 114 | "query": "dummy question", 115 | "positive_passages": [ 116 | {"text": "positive text", "docid": "doc1", "title": None}, 117 | ], 118 | "negative_passages": [], 119 | } 120 | ] 121 | ) 122 | 123 | def process_fn(**_: Any) -> dict[str, Any]: 124 | return { 125 | "pruned_context": "positive text", 126 | "reranking_score": 0.9, 127 | "compression_rate": 25.0, 128 | } 129 | 130 | records, stats, num_queries = build_records( 131 | process_fn, 132 | dataset, 133 | threshold=0.1, 134 | batch_size=1, 135 | log_timing=False, 136 | use_best_reranker_score=True, 137 | show_progress=False, 138 | ) 139 | 140 | assert num_queries == 1 141 | assert len(records) == 1 142 | assert records[0]["pruned_text"] == "positive text" 143 | assert records[0]["kept_sentences"] == [] 144 | assert records[0]["removed_sentences"] == [] 145 | assert stats["pos_scores"] == [0.9] 146 | -------------------------------------------------------------------------------- /scripts/hf_utils/update_standalone.py: -------------------------------------------------------------------------------- 1 | """ 2 | Release helper: copy the local `open_provence/modeling_open_provence_standalone.py` 3 | into the four published HF model repos (README list) without touching git-lfs 4 | artifacts, then commit and push. 5 | 6 | Runbook (Nov 22, 2025): 7 | 1) Update the standalone file locally as needed. 8 | 2) Execute `python scripts/hf_utils/update_standalone.py`. 9 | - Clones / pulls into `tmp/release_models/` with 10 | `GIT_LFS_SKIP_SMUDGE=1` to avoid LFS downloads. 11 | - Copies the standalone file, commits with a dated message, and pushes. 12 | 3) Verify: `git -C tmp/release_models/<model> log -1 --oneline` 13 | should show `chore: update standalone file (<date>)`. 14 | 4) Optional: run `python scripts/hf_utils/hf_model_process_check.py` 15 | to smoke-test the pushed code via AutoModel. 16 | """ 17 | 18 | from __future__ import annotations 19 | 20 | import argparse 21 | import os 22 | import shutil 23 | import subprocess 24 | from collections.abc import Iterable 25 | from datetime import datetime 26 | from pathlib import Path 27 | 28 | DEFAULT_MODELS: tuple[str, ...]
= ( 29 | "hotchpotch/open-provence-reranker-v1", 30 | "hotchpotch/open-provence-reranker-xsmall-v1", 31 | "hotchpotch/open-provence-reranker-large-v1", 32 | "hotchpotch/open-provence-reranker-v1-gte-modernbert-base", 33 | ) 34 | 35 | REPO_ROOT = Path(__file__).resolve().parents[2] 36 | STANDALONE_SRC = REPO_ROOT / "open_provence" / "modeling_open_provence_standalone.py" 37 | 38 | 39 | def run(cmd: list[str], *, cwd: Path | None = None, env: dict[str, str] | None = None) -> None: 40 | merged_env = os.environ.copy() 41 | merged_env.update(env or {}) 42 | print(f"[cmd] {' '.join(cmd)} (cwd={cwd})") 43 | subprocess.run(cmd, cwd=cwd, env=merged_env, check=True) 44 | 45 | 46 | def ensure_repo(repo_id: str, base_dir: Path, env: dict[str, str]) -> Path: 47 | target_dir = base_dir / repo_id.split("/", maxsplit=1)[1] 48 | if not target_dir.exists(): 49 | base_dir.mkdir(parents=True, exist_ok=True) 50 | run( 51 | ["git", "clone", f"https://huggingface.co/{repo_id}", str(target_dir)], 52 | env=env, 53 | ) 54 | else: 55 | run(["git", "-C", str(target_dir), "pull", "--rebase"], env=env) 56 | return target_dir 57 | 58 | 59 | def copy_standalone(dest_repo: Path) -> None: 60 | dest = dest_repo / "modeling_open_provence_standalone.py" 61 | shutil.copy2(STANDALONE_SRC, dest) 62 | print(f"[copy] {STANDALONE_SRC} -> {dest}") 63 | 64 | 65 | def has_changes(repo_dir: Path) -> bool: 66 | result = subprocess.run( 67 | ["git", "-C", str(repo_dir), "status", "--porcelain"], 68 | check=True, 69 | capture_output=True, 70 | text=True, 71 | ) 72 | return result.stdout.strip() != "" 73 | 74 | 75 | def commit_and_push(repo_dir: Path, message: str, env: dict[str, str]) -> None: 76 | run(["git", "-C", str(repo_dir), "add", "modeling_open_provence_standalone.py"], env=env) 77 | if not has_changes(repo_dir): 78 | print("[skip] No changes to commit.") 79 | return 80 | run(["git", "-C", str(repo_dir), "commit", "-m", message], env=env) 81 | run(["git", "-C", str(repo_dir), "push"], env=env) 82 | 83 | 84 | def update_models(models: Iterable[str], base_dir: Path, commit_message: str) -> None: 85 | git_env = {"GIT_LFS_SKIP_SMUDGE": "1"} 86 | for repo_id in models: 87 | print(f"\n=== Updating {repo_id} ===") 88 | repo_dir = ensure_repo(repo_id, base_dir, git_env) 89 | copy_standalone(repo_dir) 90 | commit_and_push(repo_dir, commit_message, git_env) 91 | 92 | 93 | def parse_args() -> argparse.Namespace: 94 | parser = argparse.ArgumentParser( 95 | description="Sync modeling_open_provence_standalone.py into HF model repos without git-lfs.", 96 | ) 97 | parser.add_argument( 98 | "--models", 99 | nargs="*", 100 | default=DEFAULT_MODELS, 101 | help="Hugging Face model IDs to update (defaults to the four models in README.md).", 102 | ) 103 | parser.add_argument( 104 | "--base-dir", 105 | type=Path, 106 | default=Path("tmp/release_models"), 107 | help="Local directory for cloning HF model repos.", 108 | ) 109 | parser.add_argument( 110 | "--message", 111 | default=f"chore: update standalone file ({datetime.now().date().isoformat()})", 112 | help="Git commit message to use for pushes.", 113 | ) 114 | return parser.parse_args() 115 | 116 | 117 | def main() -> None: 118 | args = parse_args() 119 | update_models(models=args.models, base_dir=args.base_dir, commit_message=args.message) 120 | 121 | 122 | if __name__ == "__main__": 123 | main() 124 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | 
name = "open-provence" 3 | version = "0.1.0" 4 | description = "OpenProvence: efficient and robust context pruning for retrieval-augmented generation" 5 | license = { text = "MIT" } 6 | readme = "README.md" 7 | authors = [ 8 | { name = "OpenProvence Contributors", email = "hotchpotch@gmail.com" } 9 | ] 10 | maintainers = [ 11 | { name = "OpenProvence Contributors", email = "hotchpotch@gmail.com" } 12 | ] 13 | requires-python = ">=3.11" 14 | keywords = [ 15 | "Query-dependent pruning", 16 | "Text pruning", 17 | "RAG", 18 | "Retrieval-Augmented Generation", 19 | "Transformer Networks", 20 | "PyTorch", 21 | "NLP", 22 | "deep learning", 23 | ] 24 | classifiers = [ 25 | "Development Status :: 5 - Production/Stable", 26 | "Intended Audience :: Science/Research", 27 | "License :: OSI Approved :: MIT License", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12", 30 | "Programming Language :: Python :: 3.13", 31 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 32 | ] 33 | dependencies = [ 34 | "transformers>=4.57.1", 35 | "tqdm", 36 | "torch>=2.8.0,<2.9", 37 | "scikit-learn", 38 | "scipy", 39 | "huggingface-hub>=0.20.0", 40 | "Pillow", 41 | "typing_extensions>=4.5.0", 42 | "datasets==2.20.0", 43 | "sentencepiece>=0.2.0", 44 | "einops>=0.8.1", 45 | "protobuf>=6.31.1", 46 | "bunkai>=1.5.7", 47 | "langdetect>=1.0.9", 48 | "accelerate>=0.26.0", 49 | "wandb>=0.21.0", 50 | "matplotlib>=3.9.4", 51 | "nltk>=3.9.1", 52 | "fast-bunkai>=0.1.0", 53 | ] 54 | 55 | [project.urls] 56 | Homepage = "https://github.com/hotchpotch/open_provence" 57 | Repository = "https://github.com/hotchpotch/open_provence" 58 | Documentation = "https://github.com/hotchpotch/open_provence/tree/main/docs" 59 | Issues = "https://github.com/hotchpotch/open_provence/issues" 60 | 61 | 62 | [project.optional-dependencies] 63 | train = ["datasets", "accelerate>=0.20.3"] 64 | dev = [ 65 | "datasets", 66 | "accelerate>=0.20.3", 67 | "pre-commit", 68 | "pytest", 69 | "pytest-cov", 70 | "pytest-xdist>=3.6.1", 71 | ] 72 | flash-attn = ["flash-attn>=2.7.4.post1"] 73 | 74 | [project.scripts] 75 | open_provence_trainer = "open_provence.trainer_cli:main" 76 | 77 | [build-system] 78 | requires = ["setuptools>=42", "wheel"] 79 | build-backend = "setuptools.build_meta" 80 | 81 | [tool.setuptools.packages.find] 82 | include = ["open_provence*"] 83 | namespaces = false 84 | 85 | [tool.ruff] 86 | target-version = "py311" 87 | line-length = 99 88 | fix = true 89 | src = ["open_provence", "tests", "scripts"] 90 | extend-exclude = [ 91 | "configs", 92 | "debug_output", 93 | "docs", 94 | "htmlcov", 95 | "log", 96 | "output", 97 | "results", 98 | "open_provence.egg-info", 99 | "tmp", 100 | "utils", 101 | "wandb", 102 | "**/.mypy_cache", 103 | "**/.pytest_cache", 104 | ".tox", 105 | "venv", 106 | ".venv", 107 | ] 108 | include = ["**/*.py"] 109 | 110 | [tool.ruff.lint] 111 | select = [ 112 | "E", 113 | "F", 114 | "W", 115 | "I", 116 | "UP", 117 | ] 118 | ignore = [ 119 | "E203", # Whitespace before ':' 120 | "E501", # Line too long (82 > 79 characters) 121 | "D105", # undocumented-magic-method 122 | "D107", # undocumented-public-init 123 | "D205", # blank-line-after-summary 124 | "D415", # ends-in-punctuation 125 | # DoNotAssignLambda 126 | "E731" 127 | ] 128 | 129 | [tool.ruff.lint.per-file-ignores] 130 | "examples/**" = [ 131 | # Ignore `E402` (import violations) in all examples 132 | "E402", 133 | # Ignore missing required imports 134 | "I002" 135 | ] 136 | "docs/**" = [ 137 | # Ignore missing required 
imports 138 | "I002" 139 | ] 140 | 141 | [tool.ruff.lint.isort] 142 | known-third-party = ["datasets"] 143 | required-imports = ["from __future__ import annotations"] 144 | 145 | [tool.ruff.lint.pydocstyle] 146 | convention = "google" 147 | 148 | [tool.ruff.format] 149 | quote-style = "double" 150 | 151 | [tool.pytest.ini_options] 152 | testpaths = [ 153 | "tests" 154 | ] 155 | addopts = "--strict-markers -m 'not slow and not custom'" 156 | markers = [ 157 | "slow: marks tests as slow", 158 | "custom: marks tests for third-party models with custom modules" 159 | ] 160 | 161 | [tool.pyright] 162 | pythonVersion = "3.11" 163 | pythonPlatform = "Linux" 164 | typeCheckingMode = "standard" 165 | reportMissingImports = "none" # External dependencies may not have stubs 166 | reportUnusedImport = "warning" # Allow unused imports that are re-exported 167 | reportUnusedClass = true 168 | reportUnusedFunction = true 169 | reportUnusedVariable = "warning" # Common in unpacking 170 | reportDuplicateImport = true 171 | reportOptionalSubscript = false 172 | reportOptionalMemberAccess = false 173 | reportOptionalCall = false 174 | reportOptionalIterable = false 175 | reportOptionalContextManager = false 176 | reportOptionalOperand = false 177 | exclude = [ 178 | "configs", 179 | "debug_output", 180 | "docs", 181 | "log", 182 | "output", 183 | "results", 184 | "open_provence.egg-info", 185 | "tmp", 186 | "utils", 187 | "wandb", 188 | "**/__pycache__", 189 | ".venv", 190 | "venv", 191 | ".tox", 192 | ] 193 | include = [ 194 | "open_provence", 195 | "tests", 196 | "scripts", 197 | ] 198 | 199 | [[tool.uv.index]] 200 | name = "torch-cpu" 201 | url = "https://download.pytorch.org/whl/cpu" 202 | explicit = true 203 | 204 | [[tool.uv.index]] 205 | name = "torch-cu128" 206 | url = "https://download.pytorch.org/whl/cu128" 207 | explicit = true 208 | 209 | [tool.uv.sources] 210 | torch = [ 211 | { index = "torch-cpu", group = "cpu" }, 212 | { index = "torch-cu128", group = "cuda", marker = "sys_platform == 'linux' and platform_machine == 'x86_64'" }, 213 | ] 214 | 215 | [tool.uv] 216 | default-groups = ["dev", "cuda"] 217 | conflicts = [ 218 | [ 219 | { group = "cpu" }, 220 | { group = "cuda" }, 221 | ], 222 | ] 223 | 224 | [dependency-groups] 225 | cpu = [ 226 | "torch>=2.8.0,<2.9", 227 | ] 228 | 229 | cuda = [ 230 | "torch>=2.8.0,<2.9", 231 | ] 232 | 233 | dev = [ 234 | "litellm>=1.77.7", 235 | "openai>=2.3.0", 236 | "pyright>=1.1.406", 237 | "pytest>=8.4.1", 238 | "pytest-xdist>=3.6.1", 239 | "sentence-transformers>=5.1.1", 240 | "ruff>=0.6.9", 241 | "tox-uv>=1.29.0", 242 | "wandb>=0.21.0", 243 | # "vllm>=0.9.0.1", 244 | "trafilatura>=2.0.0", 245 | "spacy>=3.8.7", 246 | ] 247 | 248 | flash-attn = [ 249 | "flash-attn>=2.7.4.post1", 250 | ] 251 | -------------------------------------------------------------------------------- /tests/test_trainer_sampling.py: -------------------------------------------------------------------------------- 1 | """Tests for dataset sampling logic in ``open_provence.trainer``.""" 2 | 3 | from __future__ import annotations 4 | 5 | import random 6 | from pathlib import Path 7 | from typing import Any, cast 8 | 9 | import pytest 10 | from datasets import Dataset, DatasetDict 11 | from open_provence.trainer import ( 12 | DataArguments, 13 | _sample_dataset_randomly, 14 | prepare_dataset, 15 | sample_items_by_label_priority, 16 | ) 17 | 18 | 19 | def test_sample_dataset_randomly_is_deterministic() -> None: 20 | dataset = Dataset.from_dict({"value": list(range(10))}) 21 | 22 | rnd_first = 
random.Random(42) 23 | rnd_second = random.Random(42) 24 | 25 | sampled_first = _sample_dataset_randomly(dataset, 3, rnd_first, "test") 26 | sampled_second = _sample_dataset_randomly(dataset, 3, rnd_second, "test") 27 | 28 | assert sampled_first["value"] == sampled_second["value"] 29 | assert len(sampled_first) == 3 30 | 31 | 32 | def test_sample_dataset_randomly_returns_original_if_large_request() -> None: 33 | dataset = Dataset.from_dict({"value": [1, 2, 3]}) 34 | rnd = random.Random(42) 35 | 36 | same_dataset = _sample_dataset_randomly(dataset, 5, rnd, "test") 37 | assert same_dataset is dataset 38 | 39 | 40 | def test_sample_dataset_randomly_rejects_non_positive_sample_size() -> None: 41 | dataset = Dataset.from_dict({"value": [1, 2, 3]}) 42 | rnd = random.Random(42) 43 | 44 | with pytest.raises(ValueError): 45 | _sample_dataset_randomly(dataset, 0, rnd, "test") 46 | 47 | 48 | def _build_dataset(size: int = 10, validation_size: int = 6) -> DatasetDict: 49 | data = { 50 | "query": [f"q{i}" for i in range(size)], 51 | "positive": [f"pos{i}" for i in range(size)], 52 | "negative": [f"neg{i}" for i in range(size)], 53 | "teacher_score": [float(i) for i in range(size)], 54 | } 55 | validation = { 56 | "query": [f"vq{i}" for i in range(validation_size)], 57 | "positive": [f"vpos{i}" for i in range(validation_size)], 58 | "negative": [f"vneg{i}" for i in range(validation_size)], 59 | "teacher_score": [float(i) for i in range(validation_size)], 60 | } 61 | return DatasetDict( 62 | { 63 | "train": Dataset.from_dict(data), 64 | "validation": Dataset.from_dict(validation), 65 | } 66 | ) 67 | 68 | 69 | def test_prepare_dataset_supports_local_paths(tmp_path: Path) -> None: 70 | dataset = _build_dataset() 71 | dataset_path = tmp_path / "local_ds" 72 | dataset.save_to_disk(dataset_path) 73 | 74 | data_args = DataArguments( 75 | dataset_name="unused", 76 | subset="default", 77 | teacher_column="teacher_score", 78 | datasets=[ 79 | { 80 | "dataset_name": str(dataset_path), 81 | "teacher_column": "teacher_score", 82 | } 83 | ], 84 | ) 85 | 86 | train_dataset, eval_dataset = prepare_dataset(data_args, seed=13) 87 | 88 | assert len(train_dataset) == len(dataset["train"]) 89 | assert len(eval_dataset) == len(dataset["validation"]) 90 | 91 | 92 | def test_prepare_dataset_applies_n_samples(monkeypatch: pytest.MonkeyPatch) -> None: 93 | def fake_load_dataset(name: str, subset: str | None = None) -> DatasetDict: 94 | return _build_dataset() 95 | 96 | monkeypatch.setattr("open_provence.trainer.load_dataset", fake_load_dataset) 97 | 98 | data_args = DataArguments( 99 | dataset_name="dummy", 100 | subset="default", 101 | teacher_column="teacher_score", 102 | datasets=[ 103 | { 104 | "dataset_name": "dummy", 105 | "subset": "default", 106 | "teacher_column": "teacher_score", 107 | "n_samples": 5, 108 | } 109 | ], 110 | ) 111 | 112 | train_dataset, eval_dataset = prepare_dataset(data_args, seed=42) 113 | 114 | assert len(train_dataset) == 5 115 | assert len(eval_dataset) == 3 116 | 117 | # Deterministic sampling: rerunning with the same seed yields identical results 118 | train_dataset_again, eval_dataset_again = prepare_dataset(data_args, seed=42) 119 | assert train_dataset_again["query"] == train_dataset["query"] 120 | assert eval_dataset_again["query"] == eval_dataset["query"] 121 | 122 | 123 | def test_prepare_dataset_accepts_fractional_n_samples(monkeypatch: pytest.MonkeyPatch) -> None: 124 | def fake_load_dataset(name: str, subset: str | None = None) -> DatasetDict: 125 | return _build_dataset() 126 | 127 | 
monkeypatch.setattr("open_provence.trainer.load_dataset", fake_load_dataset) 128 | 129 | data_args = DataArguments( 130 | dataset_name="dummy", 131 | subset="default", 132 | teacher_column="teacher_score", 133 | datasets=[ 134 | { 135 | "dataset_name": "dummy", 136 | "subset": "default", 137 | "teacher_column": "teacher_score", 138 | "n_samples": 0.2, 139 | } 140 | ], 141 | ) 142 | 143 | train_dataset, eval_dataset = prepare_dataset(data_args, seed=42) 144 | 145 | assert len(train_dataset) == 2 # ceil(10 * 0.2) 146 | assert len(eval_dataset) == 2 # ceil(6 * 0.2) 147 | 148 | train_dataset_again, eval_dataset_again = prepare_dataset(data_args, seed=42) 149 | assert train_dataset_again["query"] == train_dataset["query"] 150 | assert eval_dataset_again["query"] == eval_dataset["query"] 151 | 152 | 153 | def test_sample_items_handles_missing_labels() -> None: 154 | dataset = Dataset.from_dict( 155 | { 156 | "texts": [ 157 | ["doc0", "doc1", "doc2", "doc3"], 158 | ["doc4", "doc5", "doc6"], 159 | ], 160 | "teacher_score": [ 161 | [0.9, 0.1, 0.2, 0.3], 162 | [0.8, 0.6, 0.2], 163 | ], 164 | "extra": [ 165 | ["meta0", "meta1", "meta2", "meta3"], 166 | ["meta4", "meta5", "meta6"], 167 | ], 168 | } 169 | ) 170 | 171 | sampled = sample_items_by_label_priority(dataset, max_items=2, seed=7) 172 | 173 | sampled_rows = [cast(dict[str, Any], row) for row in sampled] 174 | 175 | for row in sampled_rows: 176 | assert len(cast(list[Any], row["texts"])) == 2 177 | assert len(cast(list[Any], row["teacher_score"])) == 2 178 | assert len(cast(list[Any], row["extra"])) == 2 179 | 180 | # Deterministic across runs with same seed 181 | sampled_again = [ 182 | cast(dict[str, Any], row) 183 | for row in sample_items_by_label_priority(dataset, max_items=2, seed=7) 184 | ] 185 | assert sampled_again == sampled_rows 186 | -------------------------------------------------------------------------------- /scripts/utils/sync_output_modeling.py: -------------------------------------------------------------------------------- 1 | """Synchronise modeling_open_provence_standalone.py files in output directories.""" 2 | 3 | from __future__ import annotations 4 | 5 | import argparse 6 | import json 7 | import sys 8 | from dataclasses import dataclass 9 | from pathlib import Path 10 | from typing import Iterable, TextIO 11 | 12 | 13 | _DEPRECATED_CONFIG_KEYS: tuple[str, ...] = ( 14 | "splitter_default_language", 15 | "standalone_process_default_language", 16 | "modeling_open_provence_default_language", 17 | ) 18 | 19 | 20 | @dataclass 21 | class TargetState: 22 | modeling_path: Path 23 | config_path: Path | None 24 | modeling_needs_update: bool 25 | config_needs_update: bool 26 | removed_keys: tuple[str, ...] 
27 | 28 | def requires_action(self) -> bool: 29 | return self.modeling_needs_update or self.config_needs_update 30 | 31 | 32 | def _load_base_content(base_file: Path) -> str: 33 | if not base_file.exists(): 34 | raise FileNotFoundError(f"Base modeling file not found: {base_file}") 35 | return base_file.read_text(encoding="utf-8") 36 | 37 | 38 | def _evaluate_config(modeling_path: Path) -> tuple[Path | None, bool, tuple[str, ...]]: 39 | config_path = modeling_path.with_name("config.json") 40 | if not config_path.exists(): 41 | return None, False, () 42 | 43 | try: 44 | config = json.loads(config_path.read_text(encoding="utf-8")) 45 | except json.JSONDecodeError: 46 | return config_path, False, () 47 | 48 | if config.get("model_type") != "open_provence": 49 | return config_path, False, () 50 | 51 | removed_keys = tuple(key for key in _DEPRECATED_CONFIG_KEYS if key in config) 52 | return config_path, bool(removed_keys), removed_keys 53 | 54 | 55 | def _gather_target_states(base_content: str, output_dir: Path) -> list[TargetState]: 56 | if not output_dir.exists(): 57 | return [] 58 | 59 | states: list[TargetState] = [] 60 | for modeling_path in sorted(output_dir.rglob("modeling_open_provence_standalone.py")): 61 | current_content = modeling_path.read_text(encoding="utf-8") 62 | modeling_needs_update = current_content != base_content 63 | config_path, config_needs_update, removed_keys = _evaluate_config(modeling_path) 64 | states.append( 65 | TargetState( 66 | modeling_path=modeling_path, 67 | config_path=config_path, 68 | modeling_needs_update=modeling_needs_update, 69 | config_needs_update=config_needs_update, 70 | removed_keys=removed_keys, 71 | ) 72 | ) 73 | return states 74 | 75 | 76 | def parse_args() -> argparse.Namespace: 77 | parser = argparse.ArgumentParser( 78 | description="Copy the latest modeling_open_provence_standalone.py into every output run (dry run by default)." 
79 | ) 80 | parser.add_argument( 81 | "--overwrite", 82 | action="store_true", 83 | help="Apply changes; without this flag the script reports pending updates (dry run).", 84 | ) 85 | parser.add_argument( 86 | "--output-dir", 87 | type=Path, 88 | default=Path("output"), 89 | help="Root directory that contains run outputs (default: ./output).", 90 | ) 91 | return parser.parse_args() 92 | 93 | 94 | def plan_sync(base_file: Path, output_dir: Path) -> list[TargetState]: 95 | base_content = _load_base_content(base_file) 96 | return _gather_target_states(base_content, output_dir) 97 | 98 | 99 | def _format_removed_keys(keys: Iterable[str]) -> str: 100 | formatted = ", ".join(sorted(keys)) 101 | return formatted if formatted else "" 102 | 103 | 104 | def sync_targets( 105 | base_file: Path, 106 | output_dir: Path, 107 | overwrite: bool, 108 | *, 109 | stream: TextIO = sys.stdout, 110 | ) -> None: 111 | if not output_dir.exists(): 112 | print(f"No output directory found at {output_dir}", file=stream) 113 | return 114 | 115 | base_content = _load_base_content(base_file) 116 | targets = _gather_target_states(base_content, output_dir) 117 | if not targets: 118 | print("No matching modeling_open_provence_standalone.py files found.", file=stream) 119 | return 120 | 121 | mode = "Applying updates" if overwrite else "Planned updates" 122 | print(f"{mode} for {len(targets)} target(s):", file=stream) 123 | 124 | any_pending = False 125 | 126 | for state in targets: 127 | header = f"- {state.modeling_path}" 128 | if not overwrite: 129 | if not state.requires_action(): 130 | print(f"{header} → SKIP (already up to date)", file=stream) 131 | continue 132 | 133 | any_pending = True 134 | if state.modeling_needs_update: 135 | print( 136 | f"{header} → would copy latest modeling_open_provence_standalone.py", 137 | file=stream, 138 | ) 139 | if state.config_needs_update: 140 | removed = _format_removed_keys(state.removed_keys) 141 | print(f"{header} → would remove deprecated config keys: {removed}", file=stream) 142 | continue 143 | 144 | # overwrite 145 | if state.modeling_needs_update: 146 | state.modeling_path.write_text(base_content, encoding="utf-8") 147 | print(f"{header} → copied modeling_open_provence_standalone.py", file=stream) 148 | else: 149 | print(f"{header} → SKIP (already up to date)", file=stream) 150 | 151 | if state.config_needs_update and state.config_path is not None: 152 | config_path = state.config_path 153 | config = json.loads(config_path.read_text(encoding="utf-8")) 154 | for key in state.removed_keys: 155 | config.pop(key, None) 156 | config_path.write_text( 157 | json.dumps(config, ensure_ascii=False, indent=2) + "\n", 158 | encoding="utf-8", 159 | ) 160 | removed = _format_removed_keys(state.removed_keys) 161 | print(f"{header} → removed deprecated config keys: {removed}", file=stream) 162 | 163 | if not overwrite and any_pending: 164 | print("Re-run with --overwrite to apply these updates.", file=stream) 165 | 166 | 167 | def main() -> None: 168 | args = parse_args() 169 | repo_root = Path(__file__).resolve().parents[2] 170 | base_file = repo_root / "open_provence" / "modeling_open_provence_standalone.py" 171 | output_dir = args.output_dir 172 | if not output_dir.is_absolute(): 173 | output_dir = repo_root / output_dir 174 | 175 | sync_targets(base_file, output_dir, overwrite=args.overwrite) 176 | 177 | 178 | if __name__ == "__main__": 179 | main() 180 | -------------------------------------------------------------------------------- /scripts/hf_utils/hf_model_process_check.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | from collections.abc import Iterable, Sequence 5 | from dataclasses import dataclass 6 | 7 | from transformers import AutoModel 8 | 9 | DEFAULT_MODELS: tuple[str, ...] = ( 10 | "hotchpotch/open-provence-reranker-v1", 11 | "hotchpotch/open-provence-reranker-xsmall-v1", 12 | "hotchpotch/open-provence-reranker-large-v1", 13 | "hotchpotch/open-provence-reranker-v1-gte-modernbert-base", 14 | ) 15 | 16 | question: str = "What's your favorite Japanese food?" 17 | context: str = """ 18 | Work deadlines piled up today, and I kept rambling about budget spreadsheets to my roommate. 19 | Next spring I'm planning a trip to Japan so I can wander Kyoto's markets and taste every regional dish I find. 20 | Sushi is honestly my favourite—I want to grab a counter seat and let the chef serve endless nigiri until I'm smiling through soy sauce. 21 | Later I remembered to water the plants and pay the electricity bill before finally getting some sleep. 22 | """ 23 | 24 | 25 | @dataclass 26 | class Case: 27 | name: str 28 | question: str | Sequence[str] 29 | # allow up to 3-level nesting: queries -> docs -> sentences 30 | context: str | Sequence[str] | Sequence[Sequence[str]] | Sequence[Sequence[Sequence[str]]] 31 | 32 | 33 | @dataclass 34 | class SampleResult: 35 | case: str 36 | sample: str 37 | score: float | None 38 | compression: float 39 | pruned: str | None 40 | 41 | 42 | def build_cases() -> list[Case]: 43 | questions = [question, question] 44 | contexts = [context, context] 45 | 46 | context_sentences = [line for line in context.splitlines(True) if line.strip()] 47 | context_sentences_wrapped = [context_sentences] 48 | contexts_nested = [context_sentences_wrapped, context_sentences_wrapped] 49 | 50 | return [ 51 | Case("q=str, c=str", question, context), 52 | Case("q=list[str], c=list[str]", questions, contexts), 53 | Case("q=str, c=list[str] (split sentences)", question, context_sentences), 54 | Case( 55 | "q=str, c=list[list[str]] (split sentences, single doc)", 56 | question, 57 | context_sentences_wrapped, 58 | ), 59 | Case( 60 | "q=list[str], c=list[list[str]] (split sentences per query)", 61 | questions, 62 | contexts_nested, 63 | ), 64 | ] 65 | 66 | 67 | def _iter_samples( 68 | pruned_context, rerank_score, compression_rate 69 | ) -> Iterable[tuple[str, str | None, float | None, float]]: 70 | if not isinstance(pruned_context, list): 71 | yield "", pruned_context, rerank_score, compression_rate 72 | return 73 | 74 | for idx, text in enumerate(pruned_context): 75 | text_str = "\n".join(text) if isinstance(text, list) else text 76 | 77 | score = rerank_score[idx] if isinstance(rerank_score, list) else rerank_score 78 | compression = ( 79 | compression_rate[idx] if isinstance(compression_rate, list) else compression_rate 80 | ) 81 | 82 | if isinstance(score, list): 83 | score = score[0] if score else None 84 | if isinstance(compression, list): 85 | compression = compression[0] if compression else 0.0 86 | 87 | yield f"#{idx}", text_str, score, float(compression) 88 | 89 | 90 | def run_cases(model, threshold: float, verbose: bool) -> list[SampleResult]: 91 | results: list[SampleResult] = [] 92 | for case in build_cases(): 93 | result = model.process( 94 | question=case.question, 95 | context=case.context, 96 | threshold=threshold, 97 | show_progress=verbose, 98 | ) 99 | for sample_tag, pruned, score, compression in _iter_samples( 100 | 
result["pruned_context"], 101 | result["reranking_score"], 102 | result["compression_rate"], 103 | ): 104 | results.append( 105 | SampleResult( 106 | case=case.name, 107 | sample=sample_tag, 108 | score=None if score is None else float(score), 109 | compression=float(compression), 110 | pruned=pruned if verbose else None, 111 | ) 112 | ) 113 | return results 114 | 115 | 116 | def _format_table(rows: list[SampleResult]) -> str: 117 | headers = ["Case", "Sample", "Rerank score", "Compression"] 118 | data: list[list[str]] = [] 119 | for row in rows: 120 | sample = row.sample or "-" 121 | score = "-" if row.score is None else f"{row.score:.4f}" 122 | compression = f"{row.compression:.2f}" 123 | data.append([row.case, sample, score, compression]) 124 | 125 | col_widths = [max(len(item[i]) for item in ([headers] + data)) for i in range(len(headers))] 126 | 127 | def fmt_row(items: Sequence[str]) -> str: 128 | return " | ".join(item.ljust(col_widths[idx]) for idx, item in enumerate(items)) 129 | 130 | divider = "-+-".join("-" * width for width in col_widths) 131 | lines = [fmt_row(headers), divider] 132 | lines.extend(fmt_row(row) for row in data) 133 | return "\n".join(lines) 134 | 135 | 136 | def parse_args() -> argparse.Namespace: 137 | parser = argparse.ArgumentParser( 138 | description="Smoke-test the four HF models using the run.py sample inputs.", 139 | ) 140 | parser.add_argument( 141 | "--models", 142 | nargs="*", 143 | default=DEFAULT_MODELS, 144 | help="Hugging Face model IDs to load (default: README models).", 145 | ) 146 | parser.add_argument( 147 | "--threshold", 148 | type=float, 149 | default=0.1, 150 | help="Pruning threshold passed to model.process.", 151 | ) 152 | parser.add_argument( 153 | "--verbose", 154 | action="store_true", 155 | help="Print pruned text for each sample in addition to the summary table.", 156 | ) 157 | return parser.parse_args() 158 | 159 | 160 | def main() -> None: 161 | args = parse_args() 162 | for model_id in args.models: 163 | print(f"\n=== {model_id} ===") 164 | model = AutoModel.from_pretrained(model_id, trust_remote_code=True) 165 | model.eval() 166 | rows = run_cases(model, threshold=args.threshold, verbose=args.verbose) 167 | 168 | if args.verbose: 169 | for row in rows: 170 | if row.pruned is None: 171 | continue 172 | print(f"\n-- {row.case} {row.sample or ''}".strip()) 173 | print("Pruned context:\n" + row.pruned) 174 | print(f"Rerank score: {row.score}") 175 | print(f"Compression: {row.compression:.2f}") 176 | 177 | print("\n" + _format_table(rows)) 178 | 179 | 180 | if __name__ == "__main__": 181 | main() 182 | -------------------------------------------------------------------------------- /docs/eval_dataset.md: -------------------------------------------------------------------------------- 1 | # Dataset Evaluation Guide 2 | 3 | `scripts/eval_datasets.py` measures how many annotated evidence spans survive pruning across a configuration of context-relevance datasets. This document explains how to run the CLI, what each configuration does, and how to interpret the generated artefacts. It intentionally omits score tables so the instructions remain evergreen. 4 | 5 | ## 1. What the script checks 6 | 7 | For each dataset in a config file, the script: 8 | 9 | 1. Loads the dataset from Hugging Face (e.g., `hotchpotch/msmarco-context-relevance`). 10 | 2. Runs `model.process()` to prune each passage. 11 | 3. Compares the pruned spans against the labelled evidence annotations. 12 | 4. 
Computes span-level precision, recall, and an F2 score (Fβ with β = 2, i.e. recall-weighted), plus mean compression. 13 | 14 | Dropping relevant spans (false negatives) is more damaging than keeping surplus context, so F2 = 5 · P · R / (4 · P + R) is the headline metric. 15 | 16 | ## 2. Known gotchas in the datasets 17 | 18 | - Some datasets contain very long passages (>60 k characters). If you hit memory errors, temporarily limit evaluation via `--limit`, use the nano configs, or regenerate the dataset with shorter spans. 19 | - A small number of queries in the multilingual sets are malformed or language-mismatched. The published configs already omit the worst offenders. If you uncover new issues, send a PR to update the source dataset rather than editing the evaluation script. 20 | - Compression percentages are per-dataset averages; for heterogeneous corpora (e.g., GooAQ vs. JA-focused Wikipedia), expect different baseline compression even at the same threshold. 21 | 22 | ## 3. Config files 23 | 24 | All configs live under `configs/eval_datasets/`: 25 | 26 | | File | Purpose | 27 | | --- | --- | 28 | | `ja.yaml`, `en.yaml` | Full evaluation suites (all datasets, full sample counts). | 29 | | `ja_nano.yaml`, `en_nano.yaml` | “Nano” subsets with per-dataset `n_samples` overrides for quick smoke tests. Use these when iterating on code or verifying regressions; they run 10–20× faster. | 30 | 31 | Each entry in a config looks like: 32 | 33 | ```yaml 34 | - dataset_name: hotchpotch/msmarco-context-relevance 35 | subset: default 36 | n_samples: 100 # only in *_nano.yaml 37 | ``` 38 | 39 | The script reads each row sequentially. You can clone a file and add/remove datasets for ad‑hoc scenarios. Optional keys: 40 | 41 | - `split`: override the global split from the YAML header. 42 | - `n_samples`: cap the number of records loaded from that dataset (only present in `*_nano.yaml`). 43 | 44 | ## 4. Core command template 45 | 46 | ```bash 47 | uv run python scripts/eval_datasets.py \ 48 | --config CONFIG_PATH \ 49 | --model MODEL_DIR \ 50 | --threshold 0.1 \ 51 | --batch-size 256 \ 52 | --timing-details \ 53 | --output-json tmp/eval_