├── docs ├── imgs ├── examples ├── _static │ ├── favicon.png │ ├── README.md │ ├── mathjax.js │ └── custom_css.css ├── citation.md ├── further_details │ ├── benchmarks.md │ └── acknowledgements.md ├── _overrides │ └── partials │ │ └── source.html ├── requirements.txt └── index.md ├── test ├── __init__.py ├── test_question_answering │ ├── __init__.py │ ├── conftest.py │ ├── test_uasquad_core.py │ ├── test_uasquad.py │ └── test_uasquad_hardening.py ├── test_text_classification │ ├── __init__.py │ ├── conftest.py │ ├── test_classification.py │ └── test_news_dataset_hardening.py ├── test_token_classification │ ├── __init__.py │ ├── conftest.py │ ├── test_token.py │ └── test_pos_dataset_hardening.py └── conftest.py ├── imgs └── NaUKMA.png ├── ua_datasets ├── question_answering │ ├── __init__.py │ └── uasquad_question_answering.py ├── token_classification │ ├── __init__.py │ └── part_of_speech.py ├── text_classification │ ├── __init__.py │ └── news_classification.py ├── __init__.py └── utils.py ├── CITATION.cff ├── .github └── workflows │ ├── ci.yml │ ├── build_docs.yml │ └── release.yml ├── .pre-commit-config.yaml ├── LICENSE ├── pyproject.toml ├── examples ├── mova_pos.md ├── ua_news.md └── ua_squad.md ├── .gitignore ├── mkdocs.yml ├── README.md └── uv.lock /docs/imgs: -------------------------------------------------------------------------------- 1 | ../imgs/ -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/examples: -------------------------------------------------------------------------------- 1 | ../examples/ -------------------------------------------------------------------------------- /test/test_question_answering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/test_text_classification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/test_token_classification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /imgs/NaUKMA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fido-ai/ua-datasets/HEAD/imgs/NaUKMA.png -------------------------------------------------------------------------------- /docs/_static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fido-ai/ua-datasets/HEAD/docs/_static/favicon.png -------------------------------------------------------------------------------- /ua_datasets/question_answering/__init__.py: -------------------------------------------------------------------------------- 1 | from ua_datasets.question_answering.uasquad_question_answering import UaSquadDataset 2 | 3 | __all__ = ["UaSquadDataset"] 4 | -------------------------------------------------------------------------------- /ua_datasets/token_classification/__init__.py: -------------------------------------------------------------------------------- 1 | from ua_datasets.token_classification.part_of_speech import MovaInstitutePOSDataset 2 
| 3 | __all__ = ["MovaInstitutePOSDataset"] 4 | -------------------------------------------------------------------------------- /ua_datasets/text_classification/__init__.py: -------------------------------------------------------------------------------- 1 | from ua_datasets.text_classification.news_classification import NewsClassificationDataset 2 | 3 | __all__ = ["NewsClassificationDataset"] 4 | -------------------------------------------------------------------------------- /docs/_static/README.md: -------------------------------------------------------------------------------- 1 | The favicon is adapted from `math-integral-box` from https://materialdesignicons.com, found by way of https://pictogrammers.com. Specifically it has been adapted by filling in the integral with black. (Originally it has 100% alpha.) 2 | -------------------------------------------------------------------------------- /ua_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from ua_datasets.question_answering import UaSquadDataset 2 | from ua_datasets.text_classification import NewsClassificationDataset 3 | from ua_datasets.token_classification import MovaInstitutePOSDataset 4 | 5 | __all__ = [ 6 | "MovaInstitutePOSDataset", 7 | "NewsClassificationDataset", 8 | "UaSquadDataset", 9 | ] 10 | -------------------------------------------------------------------------------- /test/test_token_classification/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from ua_datasets.token_classification.part_of_speech import MovaInstitutePOSDataset 6 | 7 | 8 | @pytest.fixture(scope="module") 9 | def dataset(dataset_root: Path) -> MovaInstitutePOSDataset: 10 | return MovaInstitutePOSDataset(root=dataset_root, download=True) 11 | -------------------------------------------------------------------------------- /docs/_static/mathjax.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.typesetPromise() 16 | }) 17 | -------------------------------------------------------------------------------- /docs/citation.md: -------------------------------------------------------------------------------- 1 | If you found this library useful in academic research, please cite: 2 | 3 | ```bibtex 4 | @software{ua_datasets_2021, 5 | author = {Ivanyuk-Skulskiy, Bogdan and Zaliznyi, Anton and Reshetar, Oleksand and Protsyk, Oleksiy and Romanchuk, Bohdan and Shpihanovych, Vladyslav}, 6 | month = oct, 7 | title = {ua_datasets: a collection of Ukrainian language datasets}, 8 | url = {https://github.com/fido-ai/ua-datasets}, 9 | version = {1.0.0}, 10 | year = {2021} 11 | } 12 | ``` 13 | 14 | (Also consider starring the project [on GitHub](https://github.com/fido-ai/ua-datasets)!) 15 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Ivanyuk-Skulskiy" 5 | given-names: "Bogdan" 6 | - family-names: "Zaliznyi" 7 | given-names: "Anton" 8 | - family-names: "Reshetar" 9 | given-names: "Oleksand" 10 | - family-names: "Protsyk" 11 | given-names: "Oleksiy" 12 | - family-names: "Romanchuk" 13 | given-names: "Bohdan" 14 | - family-names: "Shpihanovych" 15 | given-names: "Vladyslav" 16 | title: "ua_datasets" 17 | version: 0.0.1 18 | date-released: 2021-10-09 19 | url: "https://github.com/fido-ai/ua-datasets" 20 | -------------------------------------------------------------------------------- /test/test_text_classification/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from ua_datasets import NewsClassificationDataset 6 | 7 | 8 | @pytest.fixture(scope="module") 9 | def train_dataset(dataset_root: Path) -> NewsClassificationDataset: 10 | # Pass Path directly to satisfy type checker (was str via as_posix()). 11 | return NewsClassificationDataset(root=dataset_root, split="train") 12 | 13 | 14 | @pytest.fixture(scope="module") 15 | def test_dataset(dataset_root: Path) -> NewsClassificationDataset: 16 | return NewsClassificationDataset(root=dataset_root, split="test") 17 | -------------------------------------------------------------------------------- /docs/further_details/benchmarks.md: -------------------------------------------------------------------------------- 1 | # UA-Bench 2 | 3 | The goal of __UA-Bench__ is to track real progress in Ukrainian language model development. 4 | 5 | ## UA-SQuAD 6 | 7 | | Method | Test results | Extra data | Architecture | Venue | 8 | | ------------- |:--------:|:-------:|:------:|:------:| 9 | 10 | - [robinhad/ukrainian-qa](https://github.com/robinhad/ukrainian-qa) 11 | 12 | ## UA-News 13 | 14 | | Method | Test results | Extra data | Architecture | Venue | 15 | | ------------- |:--------:|:-------:|:------:|:------:| 16 | 17 | 18 | ## Mova Institute POS 19 | 20 | | Method | Test results | Extra data | Architecture | Venue | 21 | | ------------- |:--------:|:-------:|:------:|:------:| 22 | -------------------------------------------------------------------------------- /docs/_overrides/partials/source.html: -------------------------------------------------------------------------------- 1 | {% import "partials/language.html" as lang with context %} 2 | 3 | <a href="{{ config.repo_url }}" title="{{ lang.t('source.link.title') }}" class="md-source" data-md-component="source">
4 | <div class="md-source__icon md-icon"> 5 | {% set icon = config.theme.icon.repo or "fontawesome/brands/git-alt" %} 6 | {% include ".icons/" ~ icon ~ ".svg" %} 7 | </div> 8 | <div class="md-source__repository"> 9 | {{ config.repo_name }} 10 | </div> 11 | </a> 12 | 13 | {% if config.theme.twitter_url %} 14 | <a href="{{ config.theme.twitter_url }}" class="md-source"> 15 | <div class="md-source__icon md-icon"> 16 | {% include ".icons/fontawesome/brands/twitter.svg" %} 17 | </div> 18 | <div class="md-source__repository"> 19 | {{ config.theme.twitter_name }} 20 | </div> 21 | </a>
22 | {% endif %} 23 | -------------------------------------------------------------------------------- /docs/further_details/acknowledgements.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## We thank our contributors for the UA-SQuAD dataset 4 | Kyrpa Mykyta, Ivan Makarov, Tepla Sofiia, Chudnovska Daria, Fedenko Anna, Zaremba Anna, Krainia Daria, Budenkova Marharyta, Butunaieva Diana, Stanislavska Kateryna, Samorodova Sofiia, Martynyshyn Yuliia, Matviienko Iryna, Bezruka Anastasiia, Mostova Mariia, Stepanenko Liubomyr, Bondarenko Vitaliia, Fedorenko Polina, Sydorka Bohdana, Okhrimenko Mykhailo, Hryha Ruslana, Ustynova Olha, Kondratenko Dmytro, Chornomorets Yelyzaveta, Heresh Yuliia, Hynku Anna-Mariia, Tarasiuk Kateryna, Demian Biliavskyi, Piatushko Ruslana, Pakholchak Kateryna, Barabukha Mariia, Poltorak Yuliia, Yuliia Fedor, Usenko Viktoriia, Balanchuk Yana, Kramchenkov Dmytro, Yatsiuk Mariia, Melnyk Tetiana, Biloverbenko Illia, Boiko Khrystyna, Steshenko Kateryna, Korcheva Anna, Syzonenko Anastasiia, Malysheva Alina, Yaroslava Kushcheva, Valeriia Denysenko 5 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Updated requirements to support plugins (autorefs needs MkDocs>=1.5 for event_priority) 2 | mkdocs>=1.5.3,<2.0 # Core; event_priority available from 1.5 3 | mkdocs-material>=9.5.0,<10.0 # Modern theme compatible with MkDocs 1.5+ 4 | pymdown-extensions>=10.8,<11.0 5 | mkdocstrings>=0.24.0,<0.25.0 # Current stable API; works with mkdocs>=1.5 6 | mkdocstrings-python>=1.10.0,<2.0 # Separate provider package for newer mkdocstrings 7 | mknotebooks>=0.8.0,<0.9 # Compatible with MkDocs 1.5 8 | mkdocs-autorefs>=1.0.1,<2.0 # Provides autorefs plugin using event_priority 9 | mkdocs-include-exclude-files>=0.0.1 10 | jinja2>=3.1.4,<4.0 # Newer Jinja2 fine with updated mkdocstrings 11 | nbconvert>=7.16.0,<8.0 # Modern nbconvert (Python 3.11 compatible) 12 | nbformat>=5.10.0,<6.0 13 | pygments>=2.18.0,<3.0 14 | 15 | # Legacy / project-specific utilities 16 | pytkdocs_tweaks==0.0.6 # Retain existing tweak package (validate compatibility periodically) 17 | -------------------------------------------------------------------------------- /test/test_question_answering/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import cast 3 | 4 | import pytest 5 | 6 | from ua_datasets import UaSquadDataset 7 | 8 | 9 | @pytest.fixture(scope="module", params=["train", "val"]) 10 | def dataset(request: pytest.FixtureRequest, dataset_root: Path) -> UaSquadDataset: 11 | """UaSquadDataset fixture parametrized over splits. 12 | 13 | Skips gracefully if the remote resource is unavailable or filenames differ 14 | from the assumed defaults (train.json / val.json) so that other test 15 | suites can still run.
16 | """ 17 | split: str = request.param 18 | try: 19 | return UaSquadDataset(root=dataset_root, split=split, download=True) 20 | except Exception as exc: # pragma: no cover - network/remote variability 21 | pytest.skip(f"Skipping UaSquadDataset {split!r} split: {exc}") 22 | # Help mypy understand this function always returns a UaSquadDataset (skip raises) 23 | return cast(UaSquadDataset, None) # unreachable 24 | -------------------------------------------------------------------------------- /test/test_text_classification/test_classification.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ua_datasets import NewsClassificationDataset 4 | 5 | 6 | @pytest.mark.parametrize("idx", [1, 10, 100]) 7 | def test_dataset_type( 8 | idx: int, train_dataset: NewsClassificationDataset, test_dataset: NewsClassificationDataset 9 | ) -> None: 10 | title, text, target, _ = train_dataset[idx] 11 | assert isinstance(title, str) 12 | assert isinstance(text, str) 13 | assert isinstance(target, str) 14 | 15 | title, text, target, _ = test_dataset[idx] 16 | assert isinstance(title, str) 17 | assert isinstance(text, str) 18 | assert isinstance(target, str) 19 | 20 | 21 | @pytest.mark.parametrize("dataset_size", [120_417]) 22 | def test_traindataset_size(dataset_size: int, train_dataset: NewsClassificationDataset) -> None: 23 | assert len(train_dataset) == dataset_size 24 | 25 | 26 | @pytest.mark.parametrize("dataset_size", [30_105]) 27 | def test_testdataset_size(dataset_size: int, test_dataset: NewsClassificationDataset) -> None: 28 | assert len(test_dataset) == dataset_size 29 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | test: 11 | strategy: 12 | matrix: 13 | python-version: [ '3.10', '3.11', '3.12' ] 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install uv 25 | run: pip install --upgrade uv 26 | 27 | - name: Sync dependencies 28 | run: uv sync --dev 29 | 30 | - name: Lint (ruff) 31 | run: uv run ruff check . 32 | 33 | - name: Lint (ruff format check) 34 | run: uv run ruff format --check . 
35 | 36 | - name: Type check (mypy) 37 | run: uv run mypy || true # remove '|| true' later to enforce strict 38 | 39 | - name: Test (pytest) 40 | run: uv run pytest -q --maxfail=1 --disable-warnings 41 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: ruff-check 5 | name: ruff-check 6 | entry: ./.venv/bin/ruff 7 | args: 8 | - check 9 | - --exit-non-zero-on-fix 10 | language: system 11 | types: [python] 12 | pass_filenames: false 13 | always_run: true 14 | - id: ruff-format 15 | name: ruff-format 16 | entry: ./.venv/bin/ruff 17 | args: [ format ] 18 | language: system 19 | types: [ python ] 20 | pass_filenames: false 21 | always_run: true 22 | - id: mypy 23 | name: mypy 24 | entry: ./.venv/bin/mypy 25 | args: 26 | - ua_datasets 27 | language: system 28 | types: [ python ] 29 | pass_filenames: false 30 | always_run: true 31 | - id: uv-sort 32 | name: uv-sort 33 | entry: ./.venv/bin/uv-sort 34 | language: system 35 | pass_filenames: false 36 | - id: uv-lock 37 | name: uv-lock 38 | entry: uv 39 | args: 40 | - lock 41 | - --dry-run 42 | language: system 43 | pass_filenames: false 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Bogdan Ivanyuk-Skulskiy, Anton Zaliznyi, Oleksand Reshetar, Oleksiy Protsyk, Bohdan Romanchuk, Vladyslav Shpihanovych 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | """Global pytest configuration and shared fixtures.""" 2 | 3 | from __future__ import annotations 4 | 5 | from pathlib import Path 6 | from typing import Generator 7 | 8 | import pytest 9 | 10 | 11 | def pytest_addoption(parser: pytest.Parser) -> None: 12 | parser.addoption( 13 | "--dataset-root", 14 | action="store", 15 | default=".data", 16 | help="Root directory where datasets will be cached/downloaded.", 17 | ) 18 | 19 | 20 | @pytest.fixture(scope="session") 21 | def dataset_root(request: pytest.FixtureRequest) -> Path: 22 | """Return the root path for dataset downloads/caches (session scoped).""" 23 | return Path(request.config.getoption("--dataset-root")).resolve() 24 | 25 | 26 | @pytest.fixture(scope="session", autouse=True) 27 | def _cleanup_dataset_root(dataset_root: Path) -> Generator[None, None, None]: 28 | """Remove the dataset root directory after the entire test session. 29 | 30 | Ensures no downloaded artifacts (e.g. `.data` directory) remain in the 31 | repository after tests complete, keeping the working tree clean. 32 | """ 33 | yield 34 | if dataset_root.exists(): 35 | import shutil 36 | 37 | shutil.rmtree(dataset_root, ignore_errors=True) 38 | -------------------------------------------------------------------------------- /.github/workflows/build_docs.yml: -------------------------------------------------------------------------------- 1 | name: Build docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | deploy: 10 | strategy: 11 | matrix: 12 | # Quote version to avoid YAML float coercion (3.10 -> 3.1) 13 | python-version: ['3.10'] 14 | os: [ ubuntu-latest ] 15 | runs-on: ${{ matrix.os }} 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v3 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Cache build artifacts 26 | uses: actions/cache@v4 27 | with: 28 | path: .cache 29 | key: docs-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'docs/requirements.txt') }} 30 | restore-keys: | 31 | docs-${{ runner.os }}- 32 | 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | python -m pip install . 
37 | python -m pip install -r docs/requirements.txt 38 | 39 | - name: Build docs 40 | env: 41 | PYTHONWARNINGS: ignore::DeprecationWarning 42 | run: mkdocs gh-deploy --force --strict -------------------------------------------------------------------------------- /test/test_token_classification/test_token.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ua_datasets.token_classification.part_of_speech import MovaInstitutePOSDataset 4 | 5 | 6 | @pytest.mark.parametrize("dataset_size", [7100]) 7 | def test_dataset_size(dataset_size: int, dataset: MovaInstitutePOSDataset) -> None: 8 | assert len(dataset) == dataset_size 9 | 10 | 11 | def test_first_sample_non_empty(dataset: MovaInstitutePOSDataset) -> None: 12 | sample, labels = dataset[0] 13 | assert sample, "First token sequence should not be empty" 14 | assert labels, "First label sequence should not be empty" 15 | assert len(sample) == len(labels), "Sample and label length must match" 16 | 17 | 18 | def test_unique_labels(dataset: MovaInstitutePOSDataset) -> None: 19 | unique = dataset.unique_labels 20 | assert isinstance(unique, set) 21 | assert unique, "There should be at least one unique label" 22 | # Basic sanity: POS tags often include 'NOUN' or similar; do a soft check 23 | assert any(len(tag) > 1 for tag in unique) 24 | 25 | 26 | def test_iteration(dataset: MovaInstitutePOSDataset) -> None: 27 | first = next(iter(dataset)) 28 | assert isinstance(first, tuple) 29 | assert len(first) == 2 30 | tokens, tags = first 31 | assert len(tokens) == len(tags) 32 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ua-datasets" 3 | version = "1.0.1" 4 | description = "A collection of Ukrainian language datasets" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | license = {file = "LICENSE"} 8 | authors = [ 9 | {name = "FIdo AI", email = "ivanyuk.skulskiy@ukma.edu.ua"}, 10 | ] 11 | keywords = ["ua-datasets"] 12 | classifiers = [ 13 | "Intended Audience :: Developers", 14 | "Intended Audience :: Education", 15 | "Intended Audience :: Science/Research", 16 | "Natural Language :: Ukrainian", 17 | "License :: OSI Approved :: MIT License", 18 | "Programming Language :: Python :: 3", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | ] 23 | urls = {repository = "https://github.com/fido-ai/ua-datasets"} 24 | dependencies = [] 25 | 26 | [tool.setuptools] 27 | include-package-data = true 28 | 29 | [tool.setuptools.packages.find] 30 | include = ["ua_datasets*"] 31 | 32 | [tool.setuptools.package-data] 33 | "*" = [ 34 | "README.md", 35 | "LICENSE", 36 | "CITATION.cff", 37 | "assets/*" 38 | ] 39 | 40 | [dependency-groups] 41 | dev = [ 42 | "mypy>=1.18.2", 43 | "pre-commit>=2.21.0", 44 | "pytest>=7.4.4", 45 | "ruff>=0.14.2", 46 | "uv-sort>=0.6.1", 47 | ] 48 | 49 | [tool.ruff] 50 | line-length = 100 51 | target-version = "py310" 52 | 53 | [tool.ruff.lint] 54 | extend-select = ["I", "B", "C4", "SIM", "PT", "RUF"] 55 | ignore = [ 56 | # Example: "E501" # managed by formatter if enabled 57 | ] 58 | 59 | [tool.ruff.lint.isort] 60 | known-first-party = ["ua_datasets"] 61 | combine-as-imports = true 62 | 63 | [tool.ruff.format] 64 | quote-style = "double" 65 | indent-style = "space" 66 | skip-magic-trailing-comma = false 67 | 68 |
[tool.mypy] 69 | python_version = "3.10" 70 | packages = ["ua_datasets"] 71 | warn_unused_configs = true 72 | warn_return_any = true 73 | warn_unused_ignores = true 74 | disallow_untyped_defs = true 75 | disallow_incomplete_defs = true 76 | no_implicit_optional = true 77 | show_error_codes = true 78 | pretty = true 79 | -------------------------------------------------------------------------------- /examples/mova_pos.md: -------------------------------------------------------------------------------- 1 | # Mova Institute Part of Speech Dataset 2 | 3 | A [Mova Institute](https://mova.institute) part-of-speech tagging dataset for training models on the Ukrainian language. 4 | 5 | !!! Info 6 | Total number of files: 647 7 | Tokens: 141 286 8 | Words: 111 739 9 | Sentences: 8016 10 | 11 | ## Example of usage 12 | 13 | ### Our API 14 | 15 | ```python 16 | from ua_datasets import MovaInstitutePOSDataset 17 | 18 | mova = MovaInstitutePOSDataset(root='data/', download=True) 19 | 20 | print(mova.data) 21 | print(mova.labels) 22 | ``` 23 | 24 | Sample output: 25 | 26 | ```python 27 | Sample: ['У', 'домі', 'римського', 'патриція', 'Руфіна', 'була', 'прегарна', 'фреска', ',', ...] 28 | Labels: ['ADP', 'NOUN', 'ADJ', 'NOUN', 'PROPN', 'VERB', 'ADJ', 'NOUN', 'PUNCT', ...] 29 | ``` 30 | 31 | ## Label descriptions 32 | 33 | |Primary parts of speech|Definition |Example | 34 | | ------------- |:--------------------------:|:---------------------------------:| 35 | |NOUN |Іменник (noun) |зображення,футбол,людина | 36 | |VERB |Дієслово (verb) |робити,грати,співати | 37 | |NUMR |Числівник (numeral) |один,два,сто | 38 | |ADV |Прислівник (adverb) |абсолютно,безумовно,точно,яскраво | 39 | |ADJ |Прикметник (adjective) |звичайна,веселий,грайливий,радісний| 40 | |PREP |Прийменник (preposition) |в,у,на,під,за | 41 | |CONJ |Сполучник (conjunction) |і,та,й,але,а | 42 | |PART |Частка (particle) |не,хай,нехай,де,аби | 43 | |__Additional parts of speech__ | 44 | |PRON |Займенник (pronoun) |ти,ми,вони,я | 45 | |ADJP |Дієприкметник (participle) |Кохана,написана,прочитана,заспівана| 46 | |NUMR |Порядковий числівник (ordinal numeral)|перший,сотий,другий | 47 | 48 | Samples and corresponding labels: 49 | 50 | ``` 51 | У[ADP] домі[NOUN] римського[ADJ] патриція[NOUN] Руфіна[PROPN] була[VERB] прегарна[ADJ] фреска[NOUN] ... 52 | 53 | Ходить[VERB] постійно[ADV] у[PREP] драній[ADJP]. 54 | 55 | Зробив[VERB] перший[NUMR] крок[NOUN] для[PREP] неї[PRON]. 56 | 57 | Якось[ADV] зібралися[VERB] у[PREP] нього[PRON],[PUNCT] ховаючися[VERB] від[PREP] переслідувань[NOUN] ...
58 | ``` 59 | 60 | More detailed information can be found [here](https://github.com/mova-institute/zoloto/blob/master/docs/tagset.md#%D1%80%D0%B8%D1%81%D0%B8-%D1%84%D0%BE%D1%80%D0%BC) 61 | -------------------------------------------------------------------------------- /test/test_question_answering/test_uasquad_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from pathlib import Path 5 | 6 | from ua_datasets import UaSquadDataset 7 | 8 | TRAIN_JSON = { 9 | "data": [ 10 | {"question": "What is Python?", "context": "Python is a language.", "answer": "a language"}, 11 | {"question": "Who created Python?", "context": "Guido created it.", "answer": "Guido"}, 12 | ] 13 | } 14 | 15 | VAL_JSON = { 16 | "data": [ 17 | {"question": "Where?", "context": "In Europe.", "answer": "Europe"}, 18 | {"question": "When?", "context": "In 1991.", "answer": "1991"}, 19 | {"question": "Why?", "context": "For fun.", "answer": "For fun"}, 20 | ] 21 | } 22 | 23 | 24 | def write_json(root: Path, name: str, obj: dict) -> Path: 25 | p = root / name 26 | p.write_text(json.dumps(obj), encoding="utf8") 27 | return p 28 | 29 | 30 | def test_train_present_no_download(tmp_path: Path) -> None: 31 | write_json(tmp_path, "train.json", TRAIN_JSON) 32 | ds = UaSquadDataset(root=tmp_path, split="train", download=False) 33 | assert len(ds) == 2 34 | ex = ds[0] 35 | assert isinstance(ex, dict) 36 | assert all(isinstance(ex[k], str) and ex[k] for k in ("question", "context")) 37 | if not ex.get("is_impossible"): 38 | assert ex["answers"]["text"] 39 | assert isinstance(ex["answers"]["text"][0], str) 40 | 41 | 42 | def test_train_missing_no_download(tmp_path: Path) -> None: 43 | ds = UaSquadDataset(root=tmp_path, split="train", download=False) 44 | assert len(ds) == 0 45 | 46 | 47 | def test_val_present_no_download(tmp_path: Path) -> None: 48 | write_json(tmp_path, "val.json", VAL_JSON) 49 | ds = UaSquadDataset(root=tmp_path, split="val", download=False) 50 | assert len(ds) == 3 51 | ex = ds[len(ds) // 2] 52 | assert isinstance(ex, dict) 53 | assert all(isinstance(ex[k], str) and ex[k] for k in ("question", "context")) 54 | 55 | 56 | def test_val_missing_no_download(tmp_path: Path) -> None: 57 | ds = UaSquadDataset(root=tmp_path, split="val", download=False) 58 | assert len(ds) == 0 59 | 60 | 61 | def test_iter_matches_len(tmp_path: Path) -> None: 62 | write_json(tmp_path, "train.json", TRAIN_JSON) 63 | ds = UaSquadDataset(root=tmp_path, split="train", download=False) 64 | assert len(list(ds)) == len(ds) 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE 2 | .idea/ 3 | .vscode/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | .data/ 136 | -------------------------------------------------------------------------------- /examples/ua_news.md: -------------------------------------------------------------------------------- 1 | # UA-News 2 | 3 | ## Dataset Summary 4 | 5 | Ukrainian News is a collection of more than 150 thousand news articles, gathered from more than 20 news sources. Dataset samples are divided into 5 categories: `політика`, `спорт`, `новини`, `бізнес`, `технології`. The dataset is provided by FIdo.ai (the machine learning research division of the non-profit student organization [FIdo](https://www.facebook.com/fido.naukma/) at the National University of Kyiv-Mohyla Academy) for research purposes in data mining (classification, clustering, keyword extraction, etc.).
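As a quick sanity check of the split sizes and class balance listed below, you can use the loader's frequency helpers. A minimal sketch follows; the `labels` property and `label_frequencies()` method are the ones exercised in this repository's test suite:

```python
from ua_datasets import NewsClassificationDataset

# Download (if needed) and load the train split, then inspect the class balance.
train_data = NewsClassificationDataset(root="data/", split="train", download=True)

print(len(train_data))                 # number of samples in the split
print(train_data.labels)               # set of target categories
print(train_data.label_frequencies())  # mapping of category -> sample count
```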
6 | 7 | Dataset development is still **in progress**. 8 | 9 | ## Dataset Structure 10 | 11 | __Parameters__: 12 | 13 | - `root` : Directory path 14 | 15 | - `download`: Whether to download data 16 | 17 | - `split`: Which split of the data to load (train or test) 18 | 19 | - `return_tags`: Whether to return text keywords 20 | 21 | __Splits__: 22 | 23 | - Train : 24 | - File size: 324 MB 25 | - Number of samples: 120417 26 | - Target distribution 27 | 28 | `політика` : 40364 (33.5%) 29 | 30 | `спорт` : 40364 (33.5%) 31 | 32 | `новини` : 40364 (33.5%) 33 | 34 | `бізнес` : 40364 (33.5%) 35 | 36 | `технології` : 40364 (33.5%) 37 | 38 | - Test: 39 | - File size: 81 MB 40 | - Number of samples: 30105 41 | - Target distribution 42 | 43 | `політика` : 40364 (33.5%) 44 | 45 | `спорт` : 40364 (33.5%) 46 | 47 | `новини` : 40364 (33.5%) 48 | 49 | `бізнес` : 40364 (33.5%) 50 | 51 | `технології` : 40364 (33.5%) 52 | 53 | 54 | __Data sample__ 55 | ``` 56 | { 57 | "title" : 'На Донеччині зафіксували сьомий випадок коронавірусу', 58 | "text" : 'Про це повідомив голова Донецької ОДА Павло Кириленко в Facebook ...', 59 | "tags" : ['Донецька область', 'COVID-19', 'Новини'], 60 | "target" : 'новини' 61 | } 62 | ``` 63 | 64 | ## Example of usage 65 | 66 | ### Our API 67 | 68 | ```python 69 | from ua_datasets import NewsClassificationDataset 70 | 71 | train_data = NewsClassificationDataset(root='data/', split='train', return_tags=True) 72 | 73 | for title, text, target, tags in train_data: 74 | print(title, text, target, tags) 75 | ``` 76 | 77 | ### Hugging Face 🤗 API 78 | 79 | ```python 80 | from datasets import load_dataset 81 | 82 | dataset = load_dataset("FIdo-AI/ua-news") 83 | 84 | for item in dataset["train"]: 85 | title, text, tags, target = item["title"], item["text"], item["tags"], item["target"] 86 | print("Title: " + title) 87 | print("Text: " + text) 88 | print("Tags:", tags) 89 | print("Target: " + target) 90 | ``` 91 | -------------------------------------------------------------------------------- /test/test_question_answering/test_uasquad.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ua_datasets import UaSquadDataset 4 | 5 | 6 | def test_basic_integrity(dataset: UaSquadDataset) -> None: 7 | """At least one QA example is present and its components are non-empty strings. 8 | 9 | If the dataset is empty (e.g. missing val split remotely) the fixture may still 10 | supply it; in that case we skip rather than fail so CI remains green for other splits. 11 | """ 12 | if len(dataset) == 0: 13 | pytest.skip("Empty split provided (no samples).
Skipping integrity checks.") 14 | ex = dataset[0] 15 | assert isinstance(ex, dict) 16 | assert isinstance(ex.get("question"), str) 17 | assert ex["question"].strip() 18 | assert isinstance(ex.get("context"), str) 19 | assert ex["context"].strip() 20 | if not ex.get("is_impossible"): 21 | assert ex["answers"]["text"] 22 | assert isinstance(ex["answers"]["text"][0], str) 23 | 24 | 25 | def test_multiple_samples_if_available(dataset: UaSquadDataset) -> None: 26 | """Check spaced samples (first, middle, last) when dataset is large enough.""" 27 | if len(dataset) == 0: 28 | return 29 | n = len(dataset) 30 | for idx in [0, n // 2, n - 1]: 31 | ex = dataset[idx] 32 | assert isinstance(ex.get("question"), str) 33 | assert ex["question"].strip() 34 | assert isinstance(ex.get("context"), str) 35 | assert ex["context"].strip() 36 | if not ex.get("is_impossible"): 37 | assert ex["answers"]["text"] 38 | 39 | 40 | def test_iter_first_three(dataset: UaSquadDataset) -> None: 41 | """Iterating yields triplets of strings; limit to first three to stay quick.""" 42 | count_checked = 0 43 | for ex in dataset: 44 | assert isinstance(ex.get("question"), str) 45 | assert ex["question"].strip() 46 | assert isinstance(ex.get("context"), str) 47 | assert ex["context"].strip() 48 | if not ex.get("is_impossible"): 49 | assert ex["answers"]["text"] 50 | count_checked += 1 51 | # If dataset non-empty ensure we actually validated at least one 52 | assert count_checked == len(dataset) 53 | 54 | 55 | def test_examples_length_and_schema(dataset: UaSquadDataset) -> None: 56 | if len(dataset) == 0: 57 | return 58 | examples = dataset.examples 59 | assert len(examples) == len(dataset) 60 | first = examples[0] 61 | for key in ["id", "context", "question", "answers", "is_impossible"]: 62 | assert key in first 63 | assert isinstance(first["answers"], dict) 64 | 65 | 66 | def test_repr_contains_split_and_count(dataset: UaSquadDataset) -> None: 67 | r = repr(dataset) 68 | # Should at least mention the split string and a count marker 69 | assert dataset.split in r 70 | assert "examples=" in r or str(len(dataset)) in r 71 | -------------------------------------------------------------------------------- /examples/ua_squad.md: -------------------------------------------------------------------------------- 1 | # UA-SQuAD 2 | 3 | ## Dataset Summary 4 | 5 | Ukrainian version of [Stanford Question Answering Dataset](https://rajpurkar.github.io/SQuAD-explorer/) that includes context, questions and corresponding answers. Current version of the datasets consists of 13 859 samples. Dataset development is still **in progress**. 6 | 7 | !!! 
Info 8 | Number of samples: 13 859 9 | Number of questions without answers: 2 927 10 | File size: 17.1 MB 11 | 12 | ### Data sample (HF-style) 13 | 14 | ```json 15 | { 16 | "id": "3d9f1c2e7a4b1f20", 17 | "title": "DONDA", 18 | "context": "5 січня 2012 року Вест оголосив про створення компанії ...", 19 | "question": "Якою була мета нової творчої компанії DONDA, створеної Каньє?", 20 | "answers": {"text": ["виготовлення продуктів та поширення досвіду, які люди хочуть отримати й можуть собі дозволити"], "answer_start": [123]}, 21 | "is_impossible": false 22 | } 23 | ``` 24 | 25 | ## Example of usage 26 | 27 | ### Python API (HF-style examples) 28 | 29 | ```python 30 | from ua_datasets import UaSquadDataset 31 | 32 | qa_dataset = UaSquadDataset("data/", split="train", download=True) 33 | 34 | for ex in qa_dataset: # each ex is a dict 35 | print("Question:", ex["question"]) 36 | print("Answers:", ex["answers"]["text"]) # list (may be empty if is_impossible) 37 | if ex.get("is_impossible"): 38 | print("(No answer — impossible question)") 39 | break 40 | ``` 41 | 42 | ### Optional: DatasetDict helper (no external Hub required) 43 | 44 | If you have the optional `datasets` library installed, you can build a local `DatasetDict` 45 | using the in-package helper (this does NOT call the Hugging Face Hub API if the JSON 46 | files are already cached locally): 47 | 48 | ```python 49 | from ua_datasets.question_answering.uasquad_question_answering import load_ua_squad_v2 50 | 51 | dd = load_ua_squad_v2(root="data/ua_squad", download=True) # returns a datasets.DatasetDict 52 | row = dd["train"][0] 53 | print(row["question"], row["answers"]["text"], row["is_impossible"]) 54 | ``` 55 | 56 | If `datasets` is not installed, this helper raises a `RuntimeError`; install it with: 57 | 58 | ```bash 59 | uv add datasets  # or: pip install datasets 60 | ``` 61 | 62 | If you don't need a Hugging Face `Dataset`, stick with the pure-Python iteration example above. 63 | 64 | ### Migration Note 65 | 66 | Legacy versions exposed `(question, context, answer)` tuples and keys `Question/Context/Answer` in raw JSON; these have been replaced by the standard SQuAD v2 schema shown above. Update loops to: `for ex in ds: ex['question'], ex['answers']['text']`.
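For a concrete before/after, here is a minimal migration sketch; the legacy tuple form appears only as a comment, and the field names follow the SQuAD v2 schema shown above:

```python
from ua_datasets import UaSquadDataset

ds = UaSquadDataset("data/", split="train", download=True)

# Legacy access pattern (no longer supported):
# for question, context, answer in ds:
#     ...

# Current access pattern (SQuAD v2 schema):
for ex in ds:
    question, context = ex["question"], ex["context"]
    # `answers["text"]` is a list and may be empty when `is_impossible` is True.
    answer = ex["answers"]["text"][0] if not ex["is_impossible"] else None
```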
67 | 68 | ## We thank our contributors 69 | 70 | Kyrpa Mykyta, Ivan Makarov, Tepla Sofiia, Chudnovska Daria, Fedenko Anna, Zaremba Anna, Krainia Daria, Budenkova Marharyta, Butunaieva Diana, Stanislavska Kateryna, Samorodova Sofiia, Martynyshyn Yuliia, Matviienko Iryna, Bezruka Anastasiia, Mostova Mariia, Stepanenko Liubomyr, Bondarenko Vitaliia, Fedorenko Polina, Sydorka Bohdana, Okhrimenko Mykhailo, Hryha Ruslana, Ustynova Olha, Kondratenko Dmytro, Chornomorets Yelyzaveta, Heresh Yuliia, Hynku Anna-Mariia, Tarasiuk Kateryna, Demian Biliavskyi, Piatushko Ruslana, Pakholchak Kateryna, Barabukha Mariia, Poltorak Yuliia, Yuliia Fedor, Usenko Viktoriia, Balanchuk Yana, Kramchenkov Dmytro, Yatsiuk Mariia, Melnyk Tetiana, Biloverbenko Illia, Boiko Khrystyna, Steshenko Kateryna, Korcheva Anna, Syzonenko Anastasiia, Malysheva Alina, Yaroslava Kushcheva, Valeriia Denysenko 71 | -------------------------------------------------------------------------------- /test/test_token_classification/test_pos_dataset_hardening.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from ua_datasets.token_classification.part_of_speech import ( 6 | MovaInstitutePOSDataset, 7 | ParseError, 8 | ) 9 | 10 | 11 | @pytest.fixture 12 | def tmp_dataset_root(tmp_path: Path) -> Path: 13 | return tmp_path 14 | 15 | 16 | def _write(root: Path, name: str, content: str) -> None: 17 | (root / name).write_text(content, encoding="utf8") 18 | 19 | 20 | def test_final_sentence_without_trailing_newline(tmp_dataset_root: Path) -> None: 21 | content = "1\tToken\t_\tNOUN\n" # no trailing blank line 22 | _write(tmp_dataset_root, "final.conllu.txt", content) 23 | ds: MovaInstitutePOSDataset = MovaInstitutePOSDataset( 24 | root=tmp_dataset_root, download=False, file_name="final.conllu.txt" 25 | ) 26 | assert len(ds) == 1 27 | assert ds[0][0] == ["Token"] 28 | 29 | 30 | def test_comments_and_blank_lines(tmp_dataset_root: Path) -> None: 31 | content = ( 32 | "# sent 1\n" 33 | "1\tA\t_\tDET\n" 34 | "2\tcat\t_\tNOUN\n" 35 | "\n" 36 | "# sent 2\n" 37 | "1\tSleeps\t_\tVERB\n" 38 | "2\tquietly\t_\tADV\n" 39 | ) 40 | _write(tmp_dataset_root, "comments.conllu.txt", content) 41 | ds: MovaInstitutePOSDataset = MovaInstitutePOSDataset( 42 | root=tmp_dataset_root, download=False, file_name="comments.conllu.txt" 43 | ) 44 | assert len(ds) == 2 45 | assert ds[0][0] == ["A", "cat"] 46 | 47 | 48 | def test_multiword_tokens_ignored(tmp_dataset_root: Path) -> None: 49 | content = ( 50 | "1\tI\t_\tPRON\n" 51 | "2-3\tgo+ing\t_\t_\n" # multiword range 52 | "2\tam\t_\tAUX\n" 53 | "3\tgoing\t_\tVERB\n" 54 | "4\thome\t_\tNOUN\n" 55 | "\n" 56 | ) 57 | _write(tmp_dataset_root, "mwt.conllu.txt", content) 58 | ds: MovaInstitutePOSDataset = MovaInstitutePOSDataset( 59 | root=tmp_dataset_root, download=False, file_name="mwt.conllu.txt" 60 | ) 61 | tokens, tags = ds[0] 62 | assert tokens == ["I", "am", "going", "home"] 63 | assert tags == ["PRON", "AUX", "VERB", "NOUN"] 64 | 65 | 66 | def test_malformed_lines_ignored(tmp_dataset_root: Path) -> None: 67 | content = "1\tOk\t_\tINTJ\nBADLINE WITHOUT TABS\n2\tthen\t_\tADV\n\n" 68 | _write(tmp_dataset_root, "bad.conllu.txt", content) 69 | ds: MovaInstitutePOSDataset = MovaInstitutePOSDataset( 70 | root=tmp_dataset_root, download=False, file_name="bad.conllu.txt" 71 | ) 72 | tokens, tags = ds[0] 73 | assert tokens == ["Ok", "then"] 74 | assert tags == ["INTJ", "ADV"] 75 | 76 | 77 | def test_empty_file_raises_parse_error(tmp_dataset_root: 
Path) -> None: 78 | _write(tmp_dataset_root, "empty.conllu.txt", "") 79 | with pytest.raises(ParseError): 80 | MovaInstitutePOSDataset(root=tmp_dataset_root, download=False, file_name="empty.conllu.txt") 81 | 82 | 83 | def test_label_frequencies(tmp_dataset_root: Path) -> None: 84 | content = "1\tHello\t_\tINTJ\n2\tworld\t_\tNOUN\n\n1\tworld\t_\tNOUN\n2\tagain\t_\tADV\n\n" 85 | _write(tmp_dataset_root, "freq.conllu.txt", content) 86 | ds: MovaInstitutePOSDataset = MovaInstitutePOSDataset( 87 | root=tmp_dataset_root, download=False, file_name="freq.conllu.txt" 88 | ) 89 | freqs = ds.label_frequencies() 90 | assert freqs["NOUN"] == 2 91 | assert freqs["INTJ"] == 1 92 | assert freqs["ADV"] == 1 93 | # unique_labels still consistent 94 | assert ds.unique_labels == {"INTJ", "NOUN", "ADV"} 95 | -------------------------------------------------------------------------------- /test/test_question_answering/test_uasquad_hardening.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from ua_datasets.question_answering.uasquad_question_answering import ( 7 | ParseError, 8 | UaSquadDataset, 9 | ) 10 | 11 | 12 | def _write(root: Path, name: str, obj: object) -> Path: 13 | p = root / name 14 | if isinstance(obj, str): 15 | p.write_text(obj, encoding="utf8") 16 | else: 17 | p.write_text(json.dumps(obj), encoding="utf8") 18 | return p 19 | 20 | 21 | @pytest.fixture 22 | def qa_tmp_root(tmp_path: Path) -> Path: 23 | return tmp_path 24 | 25 | 26 | def test_malformed_json_raises(qa_tmp_root: Path) -> None: 27 | _write(qa_tmp_root, "train.json", "{not-json}") 28 | with pytest.raises(ParseError): 29 | UaSquadDataset(root=qa_tmp_root, split="train", download=False) 30 | 31 | 32 | def test_empty_data_list_raises(qa_tmp_root: Path) -> None: 33 | _write(qa_tmp_root, "train.json", {"data": []}) 34 | with pytest.raises(ParseError): 35 | UaSquadDataset(root=qa_tmp_root, split="train", download=False) 36 | 37 | 38 | def test_answer_frequencies_and_unique(qa_tmp_root: Path) -> None: 39 | obj = { 40 | "data": [ 41 | {"question": "Q1", "context": "C1", "answer": "A1"}, 42 | {"question": "Q2", "context": "C2", "answer": "A1"}, 43 | {"question": "Q3", "context": "C3", "answer": "A2"}, 44 | ] 45 | } 46 | _write(qa_tmp_root, "train.json", obj) 47 | ds = UaSquadDataset(root=qa_tmp_root, split="train", download=False) 48 | freqs = ds.answer_frequencies() 49 | assert freqs == {"A1": 2, "A2": 1} 50 | assert ds.unique_answers == {"A1", "A2"} 51 | 52 | 53 | def test_force_download_skip(monkeypatch: pytest.MonkeyPatch, qa_tmp_root: Path) -> None: 54 | # Existing file should bypass network when force_download False 55 | _write(qa_tmp_root, "train.json", {"data": [{"question": "Q", "context": "C", "answer": "A"}]}) 56 | called = {"count": 0} 57 | 58 | def fake_urlopen(url: str, timeout: int = 0) -> None: # pragma: no cover - should not be used 59 | called["count"] += 1 60 | raise AssertionError("Should not download when file exists and force_download=False") 61 | 62 | monkeypatch.setattr( 63 | "ua_datasets.question_answering.uasquad_question_answering.urlopen", fake_urlopen 64 | ) 65 | UaSquadDataset(root=qa_tmp_root, split="train", download=True, force_download=False) 66 | assert called["count"] == 0 67 | 68 | 69 | def test_force_download_replaces(monkeypatch: pytest.MonkeyPatch, qa_tmp_root: Path) -> None: 70 | # Initial file 71 | _write(qa_tmp_root, "train.json", {"data": [{"question": "Q", "context": "C", "answer": "A"}]}) 72 
| new_payload = {"data": [{"question": "QNEW", "context": "CNEW", "answer": "ANEW"}]} 73 | 74 | class FakeResp: 75 | def __init__(self, data: bytes) -> None: 76 | self._data = data 77 | 78 | def read(self) -> bytes: 79 | return self._data 80 | 81 | def __enter__(self) -> "FakeResp": 82 | return self 83 | 84 | def __exit__(self, exc_type: object, exc: object, tb: object) -> None: 85 | return None 86 | 87 | def fake_urlopen(url: str, timeout: int = 0) -> FakeResp: 88 | return FakeResp(json.dumps(new_payload).encode("utf8")) 89 | 90 | monkeypatch.setattr( 91 | "ua_datasets.question_answering.uasquad_question_answering.urlopen", fake_urlopen 92 | ) 93 | ds = UaSquadDataset(root=qa_tmp_root, split="train", download=True, force_download=True) 94 | assert len(ds) == 1 95 | ex = ds[0] 96 | assert ex["question"] == "QNEW" 97 | if not ex.get("is_impossible"): 98 | assert ex["answers"]["text"][0] == "ANEW" 99 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # UA-datasets 2 | 3 | > Unified, lightweight access to Ukrainian NLP benchmark datasets (QA, Text Classification, POS tagging) with automatic download, caching and consistent iteration. 4 | 5 | **UA-datasets** is maintained by FIdo.ai (machine learning research division of the non-profit student organization [FIdo](https://www.facebook.com/fido.naukma/) at the National University of Kyiv-Mohyla Academy) for research purposes. 6 | 7 | --- 8 | 9 | ## Features at a glance 10 | 11 | | Capability | Description | 12 | |------------|-------------| 13 | | Unified API | `len(ds)`, indexing, iteration across all datasets | 14 | | Resilient downloads | Retries, integrity / basic validation, fallback filenames (UA-SQuAD val) | 15 | | Minimal deps | Core loaders rely only on the standard library | 16 | | Consistent samples | QA: HF-style dict (`id`, `title`, `context`, `question`, `answers`, `is_impossible`); Classification `(title, text, label, tags?)`; POS `(tokens, tags)` | 17 | | Frequency helpers | Simple methods for label/answer frequency analysis | 18 | | Ready for tooling | Works seamlessly with `uv`, `ruff`, `mypy`, `pytest`, `pre-commit` | 19 | 20 | --- 21 | 22 | ## Available Datasets 23 | 24 | | Task | Dataset | Class | Splits | Notes | 25 | |------|---------|-------|--------|-------| 26 | | Question Answering | UA-SQuAD | `UaSquadDataset` | `train`, `val` | SQuAD-style JSON; legacy val filename fallbacks | 27 | | Text Classification | UA-News | `NewsClassificationDataset` | `train`, `test` | CSV (title,text,target[,tags]); optional tag parsing | 28 | | POS Tagging | Mova Institute POS | `MovaInstitutePOSDataset` | corpus | CoNLL-U like format; yields (tokens, tags) | 29 | 30 | --- 31 | 32 | ## Quick Start 33 | 34 | ```python 35 | from pathlib import Path 36 | from ua_datasets.question_answering import UaSquadDataset 37 | 38 | ds = UaSquadDataset(root=Path("./data/ua_squad"), split="train", download=True) 39 | print(f"Examples: {len(ds)}") 40 | ex = ds[0] 41 | print(ex["question"], "->", ex["answers"]["text"]) # list of answers (possibly empty if impossible) 42 | ``` 43 | 44 | Text classification: 45 | 46 | ```python 47 | from ua_datasets.text_classification import NewsClassificationDataset 48 | news = NewsClassificationDataset(root=Path("./data/ua_news"), split="train", download=True) 49 | title, text, label, tags = news[0] 50 | ``` 51 | 52 | POS tagging: 53 | 54 | ```python 55 | from 
ua_datasets.token_classification import MovaInstitutePOSDataset 56 | pos = MovaInstitutePOSDataset(root=Path("./data/mova_pos"), download=True) 57 | tokens, tags = pos[0] 58 | ``` 59 | 60 | --- 61 | 62 | ## Installation 63 | 64 | Choose one method: 65 | 66 | ### Using `uv` (recommended) 67 | 68 | ```bash 69 | uv add ua-datasets 70 | ``` 71 | 72 | ### Via pip 73 | 74 | ```bash 75 | pip install ua_datasets 76 | ``` 77 | 78 | ### From source (editable) 79 | 80 | ```bash 81 | git clone https://github.com/fido-ai/ua-datasets.git 82 | cd ua-datasets 83 | pip install -e . 84 | ``` 85 | 86 | --- 87 | 88 | ## Benchmarks & Acknowledgements 89 | 90 | - **Benchmarks:** See [Benchmarks](further_details/benchmarks.md) for leaderboard scaffolding. 91 | - **Acknowledgements:** See [Acknowledgements](further_details/acknowledgements.md) for dataset contributors. 92 | 93 | --- 94 | 95 | ## Citation 96 | 97 | If you found this library useful in academic research, please cite: 98 | 99 | ```bibtex 100 | @software{ua_datasets_2021, 101 | author = {Ivanyuk-Skulskiy, Bogdan and Zaliznyi, Anton and Reshetar, Oleksand and Protsyk, Oleksiy and Romanchuk, Bohdan and Shpihanovych, Vladyslav}, 102 | month = oct, 103 | title = {ua_datasets: a collection of Ukrainian language datasets}, 104 | url = {https://github.com/fido-ai/ua-datasets}, 105 | version = {1.0.0}, 106 | year = {2021} 107 | } 108 | ``` 109 | 110 | ⭐ Consider starring the project on [GitHub](https://github.com/fido-ai/ua-datasets) to support visibility. 111 | -------------------------------------------------------------------------------- /docs/_static/custom_css.css: -------------------------------------------------------------------------------- 1 | /* Fix /page#foo going to the top of the viewport and being hidden by the navbar */ 2 | html { 3 | scroll-padding-top: 50px; 4 | } 5 | 6 | /* Fit the Twitter handle alongside the GitHub one in the top right. 
*/ 7 | 8 | div.md-header__source { 9 | width: revert; 10 | max-width: revert; 11 | } 12 | 13 | a.md-source { 14 | display: inline-block; 15 | } 16 | 17 | .md-source__repository { 18 | max-width: 100%; 19 | } 20 | 21 | /* Emphasise sections of nav on left hand side */ 22 | 23 | nav.md-nav { 24 | padding-left: 5px; 25 | } 26 | 27 | nav.md-nav--secondary { 28 | border-left: revert !important; 29 | } 30 | 31 | .md-nav__title { 32 | font-size: 0.9rem; 33 | } 34 | 35 | .md-nav__item--section > .md-nav__link { 36 | font-size: 0.9rem; 37 | } 38 | 39 | /* Indent autogenerated documentation */ 40 | 41 | div.doc-contents { 42 | padding-left: 25px; 43 | border-left: 4px solid rgba(230, 230, 230); 44 | } 45 | 46 | /* Increase visibility of splitters "---" */ 47 | 48 | [data-md-color-scheme="default"] .md-typeset hr { 49 | border-bottom-color: rgb(0, 0, 0); 50 | border-bottom-width: 1pt; 51 | } 52 | 53 | [data-md-color-scheme="slate"] .md-typeset hr { 54 | border-bottom-color: rgb(230, 230, 230); 55 | } 56 | 57 | /* More space at the bottom of the page */ 58 | 59 | .md-main__inner { 60 | margin-bottom: 1.5rem; 61 | } 62 | 63 | /* Remove prev/next footer buttons */ 64 | 65 | .md-footer__inner { 66 | display: none; 67 | } 68 | 69 | /* Change font sizes */ 70 | 71 | html { 72 | /* Decrease font size for overall webpage 73 | Down from 137.5% which is the Material default */ 74 | font-size: 110%; 75 | } 76 | 77 | .md-typeset .admonition { 78 | /* Increase font size in admonitions */ 79 | font-size: 100% !important; 80 | } 81 | 82 | .md-typeset details { 83 | /* Increase font size in details */ 84 | font-size: 100% !important; 85 | } 86 | 87 | .md-typeset h1 { 88 | font-size: 1.6rem; 89 | } 90 | 91 | .md-typeset h2 { 92 | font-size: 1.5rem; 93 | } 94 | 95 | .md-typeset h3 { 96 | font-size: 1.3rem; 97 | } 98 | 99 | .md-typeset h4 { 100 | font-size: 1.1rem; 101 | } 102 | 103 | .md-typeset h5 { 104 | font-size: 0.9rem; 105 | } 106 | 107 | .md-typeset h6 { 108 | font-size: 0.8rem; 109 | } 110 | 111 | /* Bugfix: remove the superfluous parts generated when doing: 112 | 113 | ??? Blah 114 | 115 | ::: library.something 116 | */ 117 | 118 | .md-typeset details .mkdocstrings > h4 { 119 | display: none; 120 | } 121 | 122 | .md-typeset details .mkdocstrings > h5 { 123 | display: none; 124 | } 125 | 126 | /* Change default colours for tags */ 127 | 128 | [data-md-color-scheme="default"] { 129 | --md-typeset-a-color: rgb(0, 189, 164) !important; 130 | } 131 | [data-md-color-scheme="slate"] { 132 | --md-typeset-a-color: rgb(0, 189, 164) !important; 133 | } 134 | 135 | /* Highlight functions, classes etc. type signatures. Really helps to make clear where 136 | one item ends and another begins. 
*/ 137 | 138 | [data-md-color-scheme="default"] { 139 | --doc-heading-color: #DDD; 140 | --doc-heading-border-color: #CCC; 141 | --doc-heading-color-alt: #F0F0F0; 142 | } 143 | [data-md-color-scheme="slate"] { 144 | --doc-heading-color: rgb(25,25,33); 145 | --doc-heading-border-color: rgb(25,25,33); 146 | --doc-heading-color-alt: rgb(33,33,44); 147 | --md-code-bg-color: rgb(38,38,50); 148 | } 149 | 150 | h4.doc-heading { 151 | /* NOT var(--md-code-bg-color) as that's not visually distinct from other code blocks.*/ 152 | background-color: var(--doc-heading-color); 153 | border: solid var(--doc-heading-border-color); 154 | border-width: 1.5pt; 155 | border-radius: 2pt; 156 | padding: 0pt 5pt 2pt 5pt; 157 | } 158 | h5.doc-heading, h6.heading { 159 | background-color: var(--doc-heading-color-alt); 160 | border-radius: 2pt; 161 | padding: 0pt 5pt 2pt 5pt; 162 | } 163 | -------------------------------------------------------------------------------- /test/test_text_classification/test_news_dataset_hardening.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from ua_datasets.text_classification.news_classification import ( 6 | NewsClassificationDataset, 7 | ParseError, 8 | ) 9 | 10 | 11 | def _write(root: Path, name: str, text: str) -> Path: 12 | p = root / name 13 | p.write_text(text, encoding="utf8") 14 | return p 15 | 16 | 17 | @pytest.fixture 18 | def tmp_news_root(tmp_path: Path) -> Path: 19 | return tmp_path 20 | 21 | 22 | def test_empty_file_raises_parse_error(tmp_news_root: Path) -> None: 23 | _write(tmp_news_root, "train.csv", "") 24 | with pytest.raises(ParseError): 25 | NewsClassificationDataset(root=tmp_news_root, split="train", download=False) 26 | 27 | 28 | def test_missing_required_column(tmp_news_root: Path) -> None: 29 | # Missing target column 30 | content = "title,text,tags\nA,B,tag1|tag2\n" 31 | _write(tmp_news_root, "train.csv", content) 32 | with pytest.raises(ParseError): 33 | NewsClassificationDataset(root=tmp_news_root, split="train", download=False) 34 | 35 | 36 | def test_basic_loading_and_label_cache(tmp_news_root: Path) -> None: 37 | content = "title,text,tags,target\nT1,Body one,tag1|tag2,CLASS1\nT2,Body two,,CLASS2\n" 38 | _write(tmp_news_root, "train.csv", content) 39 | ds = NewsClassificationDataset(root=tmp_news_root, split="train", download=False) 40 | assert len(ds) == 2 41 | assert ds.labels == {"CLASS1", "CLASS2"} 42 | freqs = ds.label_frequencies() 43 | assert freqs == {"CLASS1": 1, "CLASS2": 1} 44 | 45 | 46 | def test_tag_parsing_return_tags(tmp_news_root: Path) -> None: 47 | content = "title,text,tags,target\nT1,Body one,tag1|tag2,CLASS1\nT2,Body two,tag3,CLASS1\n" 48 | _write(tmp_news_root, "train.csv", content) 49 | ds = NewsClassificationDataset( 50 | root=tmp_news_root, split="train", download=False, return_tags=True 51 | ) 52 | title, _text, target, tags = ds[0] 53 | assert title == "T1" 54 | assert target == "CLASS1" 55 | assert tags == ["tag1", "tag2"] 56 | # second sample 57 | _, _, _, tags2 = ds[1] 58 | assert tags2 == ["tag3"] 59 | 60 | 61 | def test_no_trailing_newline(tmp_news_root: Path) -> None: 62 | # File ends without newline, should still parse second row 63 | content = "title,text,tags,target\nT1,Body one,,A\nT2,Body two,,B" # no trailing newline 64 | _write(tmp_news_root, "train.csv", content) 65 | ds = NewsClassificationDataset(root=tmp_news_root, split="train", download=False) 66 | assert len(ds) == 2 67 | 68 | 69 | def 
test_force_download_skips_when_disabled( 70 | monkeypatch: pytest.MonkeyPatch, tmp_news_root: Path 71 | ) -> None: 72 | # Create existing file then ensure download is *not* called when force_download False 73 | content = "title,text,tags,target\nT1,Body one,,A\n" 74 | _write(tmp_news_root, "train.csv", content) 75 | called = {"count": 0} 76 | 77 | def fake_urlopen(url: str, timeout: int = 0) -> None: 78 | called["count"] += 1 79 | raise AssertionError("Should not be called when file exists and force_download=False") 80 | 81 | monkeypatch.setattr("ua_datasets.text_classification.news_classification.urlopen", fake_urlopen) 82 | NewsClassificationDataset(root=tmp_news_root, split="train", download=False) 83 | assert called["count"] == 0 84 | 85 | 86 | def test_force_download_triggers(monkeypatch: pytest.MonkeyPatch, tmp_news_root: Path) -> None: 87 | content = "title,text,tags,target\nT1,Body one,,A\n" 88 | _write(tmp_news_root, "train.csv", content) 89 | 90 | # Replace content via forced download 91 | new_csv = "title,text,tags,target\nN1,Body new,,B\n" 92 | 93 | class FakeResponse: 94 | def __init__(self, data: bytes) -> None: 95 | self._data = data 96 | 97 | def read(self) -> bytes: 98 | return self._data 99 | 100 | def __enter__(self) -> "FakeResponse": 101 | return self 102 | 103 | def __exit__(self, exc_type: object, exc: object, tb: object) -> None: 104 | return None 105 | 106 | def fake_urlopen(url: str, timeout: int = 0) -> FakeResponse: 107 | return FakeResponse(new_csv.encode("utf8")) 108 | 109 | monkeypatch.setattr("ua_datasets.text_classification.news_classification.urlopen", fake_urlopen) 110 | ds = NewsClassificationDataset( 111 | root=tmp_news_root, split="train", download=True, force_download=True 112 | ) 113 | assert len(ds) == 1 114 | title, *_ = ds[0] 115 | assert title == "N1" 116 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | theme: 2 | name: material 3 | features: 4 | - navigation.sections # Sections are included in the navigation on the left. 5 | - toc.integrate # Table of contents is integrated on the left; does not appear separately on the right. 6 | - header.autohide # header disappears as you scroll 7 | palette: 8 | # Light mode / dark mode 9 | # We deliberately don't automatically use `media` to check a user's preferences. We default to light mode as 10 | # (a) it looks more professional, and (b) is more obvious about the fact that it offers a (dark mode) toggle. 11 | - scheme: default 12 | primary: white 13 | accent: amber 14 | toggle: 15 | icon: material/weather-night 16 | name: Switch to dark mode 17 | - scheme: slate 18 | primary: black 19 | accent: amber 20 | toggle: 21 | icon: material/weather-sunny 22 | name: Switch to light mode 23 | icon: 24 | repo: fontawesome/brands/github # GitHub logo in top right 25 | logo: "material/math-integral-box" # Diffrax logo in top left 26 | favicon: "_static/favicon.png" 27 | custom_dir: "docs/_overrides" # Overriding part of the HTML 28 | 29 | # These additions are my own custom ones, having overridden a partial. 30 | twitter_name: "@bogdan_ivanyuk" 31 | twitter_url: "https://twitter.com/bogdan_ivanyuk" 32 | 33 | site_name: ua-datasets 34 | site_description: The documentation for the ua-datasets software library. 
35 | site_author: Bogdan Ivaniuk-Skulskyi 36 | 37 | repo_url: https://github.com/fido-ai/ua-datasets 38 | repo_name: fido-ai/ua-datasets 39 | edit_uri: "" # No edit button, as some of our pages are in /docs and some in /examples via symlink, so it's impossible for them all to be accurate 40 | 41 | strict: true # Don't allow warnings during the build process 42 | 43 | extra_javascript: 44 | # The below three make MathJax work, see https://squidfunk.github.io/mkdocs-material/reference/mathjax/ 45 | - _static/mathjax.js 46 | - https://polyfill.io/v3/polyfill.min.js?features=es6 47 | - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js 48 | 49 | extra_css: 50 | - _static/custom_css.css 51 | 52 | markdown_extensions: 53 | - pymdownx.arithmatex: # Render LaTeX via MathJax 54 | generic: true 55 | - pymdownx.superfences # Seems to enable syntax highlighting when used with the Material theme. 56 | - pymdownx.details # Allowing hidden expandable regions denoted by ??? 57 | - pymdownx.snippets: # Include one Markdown file into another 58 | base_path: docs 59 | - admonition 60 | - toc: 61 | permalink: "¤" # Adds a clickable permalink to each section heading 62 | toc_depth: 4 # Prevents h5, h6 (i.e. methods) from showing up in the TOC. 63 | 64 | plugins: 65 | - search # default search plugin; needs manually re-enabling when using any other plugins 66 | - autorefs # Cross-links to headings 67 | - include_exclude_files: 68 | include: 69 | exclude: 70 | - "_overrides" 71 | - "_static/README.md" 72 | - mknotebooks # Jupyter notebooks 73 | - mkdocstrings: 74 | handlers: 75 | python: 76 | setup_commands: 77 | - import pytkdocs_tweaks 78 | - pytkdocs_tweaks.main() 79 | 80 | selection: 81 | inherited_members: true # Allow looking up inherited methods 82 | rendering: 83 | show_root_heading: true # actually display anything at all... 84 | show_root_full_path: true # display "diffrax.asdf" not just "asdf" 85 | show_if_no_docstring: true 86 | show_signature_annotations: true 87 | show_source: false # don't include source code 88 | members_order: source # order methods according to their order of definition in the source code, not alphabetical order 89 | heading_level: 4 # Makes everything top-level be
<h4>. Child entries will be <h5>, <h6>
etc., but because of toc_depth, above, (deliberately) won't appear in the TOC. 90 | 91 | nav: 92 | - 'index.md' 93 | - 'citation.md' 94 | - Examples: 95 | - UA-SQuAD: 'examples/ua_squad.md' 96 | - Mova Institute POS: 'examples/mova_pos.md' 97 | - UA News classification: 'examples/ua_news.md' 98 | - Further details: 99 | - 'further_details/acknowledgements.md' 100 | - 'further_details/benchmarks.md' 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
<p align="center"> 3 |   <img src="imgs/NaUKMA.png" alt="NaUKMA FIdo Logo"> 4 | </p> 5 | 6 | <h1 align="center"> 7 | ua_datasets 8 | </h1>
9 | 10 | [![PyPI version](https://img.shields.io/pypi/v/ua-datasets.svg)](https://pypi.org/project/ua-datasets/) 11 | [![Python versions](https://img.shields.io/pypi/pyversions/ua-datasets.svg)](https://pypi.org/project/ua-datasets/) 12 | [![License](https://img.shields.io/pypi/l/ua-datasets.svg)](https://github.com/fido-ai/ua-datasets/blob/main/LICENSE) 13 | [![Downloads](https://static.pepy.tech/badge/ua-datasets)](https://pepy.tech/project/ua-datasets) 14 | 15 | [![Build CI](https://github.com/fido-ai/ua-datasets/actions/workflows/ci.yml/badge.svg)](https://github.com/fido-ai/ua-datasets/actions/workflows/ci.yml) 16 | [![Code size](https://img.shields.io/github/languages/code-size/fido-ai/ua-datasets)](https://github.com/fido-ai/ua-datasets) 17 | [![Code style: Ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff) 18 | [![Type checking: mypy](https://img.shields.io/badge/type%20checking-mypy-blue.svg)](http://mypy-lang.org/) 19 | 20 | [**UA-datasets**](https://fido-ai.github.io/ua-datasets/) provides ready-to-use Ukrainian NLP benchmark datasets with a **single, lightweight Python API**. 21 | 22 | > Fast access to Question Answering, News Classification, and POS Tagging corpora — with automatic download, caching, and consistent iteration. 23 | 24 | ### Why use this library? 25 | 26 | - **Unified API**: All datasets expose `len(ds)`, indexing, iteration, and simple frequency helpers. 27 | - **Robust downloads**: Automatic retries, integrity guards, and filename fallbacks for legacy splits. 28 | - **Zero heavy deps**: Pure Python + standard library (core loaders) for quick startup. 29 | - **Repro friendly**: Validation split for UA-SQuAD; classification CSV parsing with resilience to minor format drift. 30 | - **Tooling ready**: Works seamlessly with ruff, mypy, pytest, and uv-based workflows. 31 | 32 | 33 | _Maintained by the FIdo.ai research group (National University of Kyiv-Mohyla Academy)._ 34 | 35 | ## Minimal Example 36 | 37 | ```python 38 | # Assumes `uv` workspace already synced with `uv sync` and project installed. 39 | 40 | from pathlib import Path 41 | from ua_datasets.question_answering import UaSquadDataset 42 | from ua_datasets.text_classification import NewsClassificationDataset 43 | from ua_datasets.token_classification import MovaInstitutePOSDataset 44 | 45 | # Question Answering (first HF-style example dict) 46 | qa = UaSquadDataset(root=Path("./data/ua_squad"), split="train", download=True) 47 | print("QA examples:", len(qa)) 48 | example = qa[0] 49 | print(example.keys()) # id, title, context, question, answers, is_impossible 50 | print(example["question"], "->", example["answers"]["text"]) # list of accepted answers 51 | 52 | # News Classification 53 | news = NewsClassificationDataset(root=Path("./data/ua_news"), split="train", download=True) 54 | title, text, target, tags = news[0] 55 | print("Label count:", len(news.labels), "First label:", target) 56 | 57 | # Part-of-Speech Tagging 58 | pos = MovaInstitutePOSDataset(root=Path("./data/mova_pos"), download=True) 59 | tokens, tags = pos[0] 60 | print(tokens[:8], tags[:8]) 61 | ``` 62 | 63 | For development commands see the Installation section below. 64 | 65 | ## Installation 66 | 67 | Choose one of the following methods. 68 | 69 | ### 1. Using uv (recommended) 70 | 71 | Add to an existing project: 72 | 73 | ```bash 74 | uv add ua-datasets 75 | ``` 76 | 77 | 78 | 79 |
80 | ### 2. Using pip (PyPI) 81 | 82 | ```bash 83 | # install 84 | pip install ua_datasets 85 | # upgrade 86 | pip install -U ua_datasets 87 | ``` 88 | 89 |
90 | 91 |
92 | ### 3. From source (editable install) 93 | 94 | ```bash 95 | git clone https://github.com/fido-ai/ua-datasets.git 96 | cd ua-datasets 97 | pip install -e .[dev] # if you later define optional dev extras 98 | ``` 99 | 100 | Or with uv (editable semantics via local path): 101 | 102 | ```bash 103 | git clone https://github.com/fido-ai/ua-datasets.git 104 | cd ua-datasets 105 | uv sync --dev 106 | ``` 107 | 108 |
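Whichever method you pick, a quick smoke test (illustrative; the first run downloads the corpus into the given root directory):

```python
from ua_datasets.token_classification import MovaInstitutePOSDataset

# root accepts a str or Path; the loader coerces it internally
ds = MovaInstitutePOSDataset(root="./data/mova_pos", download=True)
tokens, tags = ds[0]
print(len(ds), "sentences; first tokens:", list(zip(tokens, tags))[:3])
```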
109 | 110 | 111 | ## Latest Updates 112 | 113 | | Date | Highlights | 114 | |------|------------| 115 | | 25-10-2025 | Added validation split for UA-SQuAD and updated package code. | 116 | | 05-07-2022 | Added HuggingFace API for UA-SQuAD (Q&A) and UA-News (Text Classification). | 117 | 118 | 119 | ## Available Datasets 120 | 121 | | Task | Dataset | Import Class | Splits | Notes | 122 | |------|---------|--------------|--------|-------| 123 | | Question Answering | UA-SQuAD | `UaSquadDataset` | `train`, `val` | SQuAD v2-style examples (`is_impossible`, multi answers); iteration yields dicts | 124 | | Text Classification | UA-News | `NewsClassificationDataset` | `train`, `test` | CSV (title, text, target[, tags]); optional tag parsing | 125 | | Token Classification | Mova Institute POS | `MovaInstitutePOSDataset` | (single corpus) | CoNLL-U like POS tagging; yields (tokens, tags) per sentence | 126 | 127 | ## Contribution 128 | 129 | In case you are willing to contribute (update any part of the library, add your dataset) do not hesitate to connect through [GitHub Issue](https://github.com/fido-ai/ua-datasets/issues/new/choose). Thanks in advance for your contribution! 130 | 131 | ## Citation 132 | 133 | ```bibtex 134 | @software{ua_datasets_2021, 135 | author = {Ivanyuk-Skulskiy, Bogdan and Zaliznyi, Anton and Reshetar, Oleksand and Protsyk, Oleksiy and Romanchuk, Bohdan and Shpihanovych, Vladyslav}, 136 | month = oct, 137 | title = {ua_datasets: a collection of Ukrainian language datasets}, 138 | url = {https://github.com/fido-ai/ua-datasets}, 139 | version = {1.0.0}, 140 | year = {2021} 141 | } 142 | ``` 143 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | strategy: 11 | matrix: 12 | python-version: [3.11] 13 | os: ['ubuntu-latest'] 14 | runs-on: ${{ matrix.os }} 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Setup 25 | shell: bash 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m pip install build 29 | python -m build 30 | - name: Get versions 31 | id: get-versions 32 | shell: bash 33 | run: | 34 | python -c " 35 | import subprocess 36 | import tomllib 37 | vparse = lambda x: tuple(map(int, x.split('.'))) 38 | with open('pyproject.toml', 'rb') as f: 39 | data = tomllib.load(f) 40 | name = data['project']['name'] 41 | checkout_version = data['project']['version'] 42 | pypi_version = subprocess.run(f'python -m pip index versions {name}', 43 | shell=True, capture_output=True).stdout 44 | pypi_version = pypi_version.split(b'\n', 1)[0].split(b' ')[1][1:-1].decode('utf-8') 45 | new_version = str(vparse(checkout_version) > vparse(pypi_version)).lower() 46 | subprocess.run(f'echo name={name} >> $GITHUB_OUTPUT', shell=True) 47 | subprocess.run(f'echo tag=v{checkout_version} >> $GITHUB_OUTPUT', shell=True) 48 | subprocess.run(f'echo new-version={new_version} >> $GITHUB_OUTPUT', shell=True) 49 | print(f'Got checkout_version={vparse(checkout_version)!r}') 50 | print(f'Got pypi_version={vparse(pypi_version)!r}') 51 | print(f'Setting name={name}') 52 | print(f'Setting tag=v{checkout_version}') 53 | print(f'Setting new-version={new_version}') 54 | " 55 
| - name: Test sdist 56 | id: test-sdist 57 | if: steps.get-versions.outputs.new-version == 'true' 58 | shell: bash 59 | run: | 60 | python -m pip install dist/*.tar.gz 61 | cd $(mktemp -d) 62 | set +e 63 | bash -c " 64 | python -m pip install pytest 65 | cp -r ${{ github.workspace }}/test ./test 66 | pytest --disable-warnings 67 | " 68 | if [ "$?" -eq 0 ] 69 | then 70 | echo result=true >> $GITHUB_OUTPUT 71 | else 72 | echo result=false >> $GITHUB_OUTPUT 73 | fi 74 | set -e 75 | python -m pip uninstall -y -r <(pip freeze) 76 | cd ${{ github.workspace }} 77 | - name: Test bdist_wheel 78 | id: test-bdist-wheel 79 | if: steps.get-versions.outputs.new-version == 'true' 80 | shell: bash 81 | run: | 82 | python -m pip install dist/*.whl 83 | cd $(mktemp -d) 84 | set +e 85 | bash -c " 86 | python -m pip install pytest 87 | cp -r ${{ github.workspace }}/test ./test 88 | pytest --disable-warnings 89 | " 90 | if [ "$?" -eq 0 ] 91 | then 92 | echo result=true >> $GITHUB_OUTPUT 93 | else 94 | echo result=false >> $GITHUB_OUTPUT 95 | fi 96 | set -e 97 | python -m pip uninstall -y -r <(pip freeze) 98 | cd ${{ github.workspace }} 99 | - name: Logging 100 | shell: bash 101 | run: | 102 | echo new-version=${{ steps.get-versions.outputs.new-version }} 103 | echo sdist-result=${{ steps.test-sdist.outputs.result }} 104 | echo bdist-result=${{ steps.test-bdist-wheel.outputs.result }} 105 | - name: Tag 106 | env: 107 | github_user: KyloRen1 108 | github_token: ${{ github.token }} 109 | if: (steps.get-versions.outputs.new-version == 'true') && (steps.test-sdist.outputs.result == 'true') && (steps.test-bdist-wheel.outputs.result == 'true') 110 | shell: bash 111 | run: | 112 | git config --global user.email "noreply@example.com" 113 | git config --global user.name "Action: Update Python project" 114 | git tag "${{ steps.get-versions.outputs.tag }}" -m "" 115 | git push https://$github_user:$github_token@github.com/${{ github.repository }} "${{ steps.get-versions.outputs.tag }}" 116 | - name: GitHub release 117 | if: (steps.get-versions.outputs.new-version == 'true') && (steps.test-sdist.outputs.result == 'true') && (steps.test-bdist-wheel.outputs.result == 'true') 118 | uses: softprops/action-gh-release@v1 119 | with: 120 | name: "${{ steps.get-versions.outputs.name }} ${{ steps.get-versions.outputs.tag }}" 121 | body: "Autogenerated release notes as follows:" 122 | tag_name: "${{ steps.get-versions.outputs.tag }}" 123 | token: ${{ github.token }} 124 | generate_release_notes: true 125 | 126 | - name: Push to PyPI 127 | if: (steps.get-versions.outputs.new-version == 'true') && (steps.test-sdist.outputs.result == 'true') && (steps.test-bdist-wheel.outputs.result == 'true') 128 | uses: pypa/gh-action-pypi-publish@release/v1 129 | with: 130 | password: ${{ secrets.pypi_token }} 131 | 132 | - name: Fail 133 | if: (steps.get-versions.outputs.new-version == 'true') && ((steps.test-sdist.outputs.result != 'true') || (steps.test-bdist-wheel.outputs.result != 'true')) 134 | shell: bash 135 | run: exit 1 136 | -------------------------------------------------------------------------------- /ua_datasets/utils.py: -------------------------------------------------------------------------------- 1 | """Shared internal utilities (network + atomic file helpers). 2 | 3 | This consolidates retrying download logic and atomic write operations used by 4 | multiple dataset loaders. 
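A typical call pattern (illustrative sketch; the URL is a placeholder):

    from pathlib import Path
    text = download_text_with_retries("https://example.org/data.csv", max_retries=3)
    atomic_write_text(Path("data.csv"), text)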
5 | 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from hashlib import sha256 11 | from pathlib import Path 12 | from time import sleep 13 | from typing import Any, Callable, Optional 14 | from urllib.error import HTTPError, URLError 15 | from urllib.request import urlopen 16 | 17 | __all__ = [ 18 | "DownloadFailure", 19 | "atomic_write_text", 20 | "download_text_with_retries", 21 | ] 22 | 23 | 24 | class DownloadFailure(RuntimeError): 25 | """Raised when a download ultimately fails after retries.""" 26 | 27 | 28 | def download_text_with_retries( 29 | url: str, 30 | *, 31 | timeout: int = 15, 32 | max_retries: int = 3, 33 | expected_sha256: str | None = None, 34 | backoff_factor: float = 0.5, 35 | validate: Optional[Callable[[str], bool]] = None, 36 | opener: Callable[..., Any] = urlopen, 37 | show_progress: bool = False, 38 | chunk_size: int = 8192, 39 | ) -> str: 40 | """Download URL returning decoded UTF-8 text with retries & optional integrity. 41 | 42 | Enhanced with an optional streaming progress bar (stdout) using only the 43 | standard library to preserve the project's minimal dependency footprint. 44 | 45 | Parameters 46 | ---------- 47 | url : str 48 | Resource to fetch (HTTP/HTTPS). 49 | timeout : int 50 | Per-attempt timeout (seconds). 51 | max_retries : int 52 | Maximum number of attempts before failing. 53 | expected_sha256 : str | None 54 | If provided, the hex digest must match the downloaded bytes. 55 | backoff_factor : float 56 | Linear backoff factor (sleep = factor * attempt_number). 57 | validate : Callable[[str], bool] | None 58 | Optional predicate applied to decoded text; must return True for success. 59 | opener : Callable[..., Any] 60 | Function used to open the URL (injected for test monkeypatching). 61 | show_progress : bool 62 | If True, prints a simple ASCII progress indicator while streaming. 63 | chunk_size : int 64 | Byte size for streaming chunks when show_progress is enabled. 65 | """ 66 | attempt = 0 67 | last_exc: Exception | None = None 68 | while attempt < max_retries: 69 | attempt += 1 70 | try: 71 | # Use provided opener (enables test monkeypatching at call sites) 72 | with opener(url, timeout=timeout) as resp: # nosec - caller controls domain 73 | if show_progress: 74 | # Attempt to read content length for percentage; fallback to 0 (unknown) 75 | try: 76 | total_size = int(getattr(resp, "headers", {}).get("Content-Length", "0")) 77 | except Exception: 78 | total_size = 0 79 | downloaded = 0 80 | buf = bytearray() 81 | while True: 82 | # Some mocked/monkeypatched responses (in tests) provide a 83 | # read() method that does NOT accept a size argument OR return 84 | # the full payload on every call (no internal cursor). We: 85 | # 1. Attempt sized reads 86 | # 2. Fallback to a single full read if TypeError is raised 87 | # 3. Break immediately after a fallback full read to avoid 88 | # an infinite loop continually re-appending identical bytes. 
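# Concretely (hypothetical test double): a response whose read() is defined as
# "def read(self): return b'payload'" hands back the same bytes on every call,
# so after one such fallback full read we must break out of the loop below.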
89 | try: 90 | chunk = resp.read(chunk_size) 91 | fallback_full_read = False 92 | except TypeError: # signature read() -> bytes (no size param) 93 | chunk = resp.read() 94 | fallback_full_read = True 95 | if not chunk: 96 | break 97 | buf.extend(chunk) 98 | downloaded += len(chunk) 99 | if total_size > 0: 100 | pct = downloaded / total_size * 100 101 | bar_width = 30 102 | filled = int(bar_width * downloaded / total_size) 103 | bar = "#" * filled + "-" * (bar_width - filled) 104 | print( 105 | f"\rDownloading {url} [{bar}] {pct:5.1f}% ({downloaded}/{total_size} bytes)", 106 | end="", 107 | flush=True, 108 | ) 109 | else: 110 | print(f"\rDownloading {url} {downloaded} bytes", end="", flush=True) 111 | if fallback_full_read: 112 | # Prevent infinite loop when mock returns whole content each call 113 | break 114 | data = bytes(buf) 115 | # Ensure newline after completion for clean subsequent output 116 | print() 117 | else: 118 | data = resp.read() 119 | if expected_sha256 is not None: 120 | digest = sha256(data).hexdigest() 121 | if digest.lower() != expected_sha256.lower(): 122 | raise DownloadFailure( 123 | f"SHA256 mismatch for {url}: expected {expected_sha256} got {digest}" 124 | ) 125 | text = data.decode("utf8") 126 | if not text.strip(): 127 | raise DownloadFailure("Downloaded content empty/whitespace.") 128 | if validate and not validate(text): 129 | raise DownloadFailure("Validation predicate rejected content.") 130 | return text 131 | except (HTTPError, URLError, TimeoutError, DownloadFailure) as exc: 132 | last_exc = exc 133 | if attempt < max_retries: 134 | sleep(backoff_factor * attempt) 135 | else: 136 | break 137 | except UnicodeDecodeError as exc: 138 | last_exc = exc 139 | break 140 | except Exception as exc: # unknown fatal 141 | last_exc = exc 142 | break 143 | raise DownloadFailure(f"Failed to download {url} after {max_retries} attempts: {last_exc}") 144 | 145 | 146 | def atomic_write_text(path: Path, text: str, *, encoding: str = "utf8") -> None: 147 | """Write text atomically by first writing to a temporary sibling file. 148 | 149 | Ensures readers do not observe a partially written file. 150 | """ 151 | tmp = path.with_suffix(path.suffix + ".tmp") 152 | tmp.write_text(text, encoding=encoding) 153 | tmp.replace(path) 154 | -------------------------------------------------------------------------------- /ua_datasets/token_classification/part_of_speech.py: -------------------------------------------------------------------------------- 1 | """Part-of-speech tagging dataset loader for the Mova Institute corpus. 2 | 3 | This module provides a light-weight, dependency-free interface to download and 4 | parse a (CoNLL-U like) POS tagging dataset with a focus on robustness and 5 | clarity. 
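The expected layout is CoNLL-U like: one token per tab-separated line
(ID, FORM, LEMMA, UPOS, ...), '#' comment lines, and a blank line between
sentences. An illustrative two-token sentence:

    # sent_id = 1
    1	Це	це	PRON	_
    2	приклад	приклад	NOUN	_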
6 | 7 | Example 8 | ------- 9 | >>> ds = MovaInstitutePOSDataset(root=Path('./data'), download=True) 10 | >>> tokens, tags = ds[0] 11 | >>> len(ds), len(tokens) == len(tags) 12 | """ 13 | 14 | from collections.abc import Sequence as ABCSequence 15 | from dataclasses import dataclass, field 16 | from pathlib import Path 17 | from typing import Dict, Generic, Iterator, List, Set, Tuple, TypeVar 18 | 19 | from ua_datasets.utils import DownloadFailure, atomic_write_text, download_text_with_retries 20 | 21 | __all__ = [ 22 | "DownloadError", 23 | "MovaInstitutePOSDataset", 24 | "ParseError", 25 | ] 26 | 27 | Sentence = List[str] 28 | TagSequence = List[str] 29 | 30 | 31 | S = TypeVar("S", bound=Sentence) 32 | T = TypeVar("T", bound=TagSequence) 33 | 34 | 35 | class DownloadError(RuntimeError): 36 | """Raised when the dataset cannot be downloaded after retries.""" 37 | 38 | 39 | class ParseError(RuntimeError): 40 | """Raised when the dataset file cannot be parsed into any sentences.""" 41 | 42 | 43 | @dataclass(slots=True) 44 | class MovaInstitutePOSDataset(ABCSequence, Generic[S, T]): 45 | """Dataset wrapper for the Mova Institute POS tagging corpus. 46 | 47 | Parameters 48 | ---------- 49 | root: 50 | Directory where the dataset file will be stored / read from. 51 | download: 52 | If True (default) the dataset will be downloaded if missing. 53 | file_name: 54 | Local filename for the cached dataset (text format). 55 | data_file: 56 | Remote URL containing the dataset contents. 57 | """ 58 | 59 | root: Path 60 | download: bool = True 61 | file_name: str = "mova_institute_pos_dataset.txt" 62 | data_file: str = "https://lab.mova.institute/files/robochyi_tb.conllu.txt" 63 | force_download: bool = False 64 | max_retries: int = 3 65 | timeout: int = 15 # seconds for individual HTTP attempt 66 | expected_sha256: str | None = None 67 | show_progress: bool = True 68 | 69 | dataset_path: Path = field(init=False) 70 | _samples: List[Sentence] = field(init=False, default_factory=list) 71 | _labels: List[TagSequence] = field(init=False, default_factory=list) 72 | _unique_labels_cache: Set[str] = field(init=False, default_factory=set) 73 | 74 | def __post_init__(self) -> None: 75 | self.root = Path(self.root) 76 | self.dataset_path = self.root / self.file_name 77 | if self.download: 78 | self.download_dataset() 79 | if not self._check_exists(): # Fail early with a clear message. 80 | raise FileNotFoundError( 81 | "Dataset not found. Use download=True to fetch it or ensure the file exists." 82 | ) 83 | self._samples, self._labels = self._load_data() 84 | if not self._samples: 85 | raise ParseError( 86 | f"Parsed zero sentences from dataset file '{self.dataset_path}'. File may be empty or malformed." 87 | ) 88 | # Cache unique labels (frozenset semantics but returning a set copy in property) 89 | self._unique_labels_cache = {lab for seq in self._labels for lab in seq} 90 | 91 | @property 92 | def labels(self) -> List[TagSequence]: 93 | """Raw label sequences (parallel to `data`).""" 94 | return self._labels 95 | 96 | @property 97 | def data(self) -> List[Sentence]: 98 | """Raw token sequences.""" 99 | return self._samples 100 | 101 | @property 102 | def unique_labels(self) -> Set[str]: 103 | """Unique set of tag labels present in the corpus (cached).""" 104 | return self._unique_labels_cache 105 | 106 | def label_frequencies(self) -> Dict[str, int]: 107 | """Return a mapping of label -> occurrence count. 108 | 109 | Useful for quick exploratory statistics. 
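Illustrative (counts depend on the corpus snapshot)::

    >>> ds.label_frequencies()  # doctest: +SKIP
    {'NOUN': 1234, 'VERB': 987, ...}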
110 | """ 111 | freqs: Dict[str, int] = {} 112 | for seq in self._labels: 113 | for lab in seq: 114 | freqs[lab] = freqs.get(lab, 0) + 1 115 | return freqs 116 | 117 | def _iter_conllu_sentences(self) -> Iterator[Tuple[Sentence, TagSequence]]: 118 | """Yield (tokens, tags) for each sentence in the dataset file.""" 119 | tokens: Sentence = [] 120 | tags: TagSequence = [] 121 | with self.dataset_path.open("r", encoding="utf8") as fh: 122 | for raw in fh: 123 | line = raw.rstrip("\n") 124 | stripped = line.strip() 125 | if not stripped: # sentence boundary 126 | if tokens: 127 | yield tokens, tags 128 | tokens, tags = [], [] 129 | continue 130 | if stripped.startswith("#"): 131 | continue 132 | parts = stripped.split("\t") 133 | if len(parts) < 4: 134 | continue 135 | id_field = parts[0] 136 | # Skip multiword tokens like '3-4' 137 | if "-" in id_field: 138 | continue 139 | if not id_field.isdigit(): 140 | continue 141 | token = parts[1] 142 | tag = parts[3] 143 | tokens.append(token) 144 | tags.append(tag) 145 | # Flush final sentence if file lacks trailing newline/blank line 146 | if tokens: 147 | yield tokens, tags 148 | 149 | def _load_data(self) -> Tuple[List[Sentence], List[TagSequence]]: 150 | samples: List[Sentence] = [] 151 | labels: List[TagSequence] = [] 152 | for sent, tag_seq in self._iter_conllu_sentences(): 153 | samples.append(sent) 154 | labels.append(tag_seq) 155 | return samples, labels 156 | 157 | def __getitem__(self, idx: int) -> Tuple[Sentence, TagSequence]: # type: ignore[override] 158 | return self._samples[idx], self._labels[idx] 159 | 160 | def __len__(self) -> int: 161 | return len(self._samples) 162 | 163 | def __iter__(self) -> Iterator[Tuple[Sentence, TagSequence]]: 164 | for sample, label in zip(self._samples, self._labels, strict=True): 165 | yield sample, label 166 | 167 | def __repr__(self) -> str: 168 | return f"{self.__class__.__name__}(n_sentences={len(self)}, unique_labels={len(self.unique_labels)})" 169 | 170 | def _check_exists(self) -> bool: 171 | return self.dataset_path.exists() 172 | 173 | def download_dataset(self) -> None: 174 | """Download the raw dataset file if needed using shared retry helper.""" 175 | if self._check_exists() and not self.force_download: 176 | return 177 | self.root.mkdir(parents=True, exist_ok=True) 178 | try: 179 | text = download_text_with_retries( 180 | self.data_file, 181 | timeout=self.timeout, 182 | max_retries=self.max_retries, 183 | expected_sha256=self.expected_sha256, 184 | show_progress=self.show_progress, 185 | ) 186 | except DownloadFailure as exc: 187 | raise DownloadError(str(exc)) from exc 188 | atomic_write_text(self.dataset_path, text) 189 | -------------------------------------------------------------------------------- /ua_datasets/text_classification/news_classification.py: -------------------------------------------------------------------------------- 1 | """News classification dataset loader. 2 | 3 | Expected CSV Columns 4 | -------------------- 5 | Required minimal columns: ``title``, ``text``, ``tags``, ``target`` in that 6 | order. (Historically this dataset has used that order.) If columns are missing 7 | or re-ordered the loader attempts to locate required names; if any mandatory 8 | column is absent a :class:`ParseError` is raised. 
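A minimal well-formed split file looks like (illustrative values):

    title,text,tags,target
    Headline,Body text,tag1|tag2,POLITICS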
9 | 10 | Example 11 | ------- 12 | >>> ds = NewsClassificationDataset(root=Path('./news'), split='train', download=True) 13 | >>> title, text, target, tags = ds[0] 14 | >>> len(ds), target in ds.labels 15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | import csv 20 | from dataclasses import dataclass, field 21 | from pathlib import Path 22 | from typing import Dict, Iterator, List, Optional, Set, Tuple 23 | from urllib.request import urlopen 24 | 25 | from ua_datasets.utils import DownloadFailure, atomic_write_text, download_text_with_retries 26 | 27 | __all__ = [ 28 | "DownloadError", 29 | "NewsClassificationDataset", 30 | "ParseError", 31 | ] 32 | 33 | 34 | class DownloadError(RuntimeError): 35 | """Raised when the dataset cannot be downloaded after retries or integrity check fails.""" 36 | 37 | 38 | class ParseError(RuntimeError): 39 | """Raised when CSV file is empty, malformed, or missing mandatory columns.""" 40 | 41 | 42 | Row = List[str] 43 | Sample = Tuple[str, str, str, Optional[List[str]]] 44 | 45 | 46 | @dataclass(slots=True) 47 | class NewsClassificationDataset: 48 | """Ukrainian news classification dataset. 49 | 50 | Parameters 51 | ---------- 52 | root: 53 | Directory where the dataset split CSV will be stored or read from. 54 | download: 55 | If ``True`` (default), download the split file if it is missing. 56 | split: 57 | One of ``"train"`` or ``"test"``. 58 | return_tags: 59 | If ``True`` parsed list of tags is returned instead of ``None`` in the 60 | 4th element of each sample tuple. 61 | """ 62 | 63 | root: Path 64 | download: bool = True 65 | split: str = "train" 66 | return_tags: bool = False 67 | 68 | base_url: str = "https://github.com/fido-ai/ua-datasets/releases/download/v0.0.1/" 69 | force_download: bool = False 70 | max_retries: int = 3 71 | timeout: int = 20 # seconds 72 | expected_sha256: str | None = None 73 | show_progress: bool = True 74 | 75 | dataset_path: Path = field(init=False) 76 | _columns: List[str] = field(init=False, default_factory=list) 77 | _rows: List[Row] = field(init=False, default_factory=list) 78 | _parsed_tags: Optional[List[List[str]]] = field(init=False, default=None) 79 | _label_cache: Set[str] = field(init=False, default_factory=set) 80 | 81 | def __post_init__(self) -> None: 82 | self.root = Path(self.root) 83 | self.dataset_path = self.root / f"{self.split}.csv" 84 | if self.download: 85 | self.download_dataset() 86 | if not self.dataset_path.exists(): 87 | raise FileNotFoundError( 88 | "Dataset not found. Use download=True to fetch it or ensure the file exists." 
89 | ) 90 | self._rows = self._load_rows() 91 | if not self._rows: 92 | raise ParseError("Loaded zero rows; file may be empty or malformed.") 93 | # Cache labels for fast repeated access 94 | self._label_cache = {row[self._columns.index("target")] for row in self._rows} 95 | 96 | def download_dataset(self) -> None: 97 | """Download the dataset split file if needed using shared helper.""" 98 | if self.dataset_path.exists() and not self.force_download: 99 | return 100 | self.root.mkdir(parents=True, exist_ok=True) 101 | url = f"{self.base_url}{self.split}.csv" 102 | try: 103 | text = download_text_with_retries( 104 | url, 105 | timeout=self.timeout, 106 | max_retries=self.max_retries, 107 | expected_sha256=self.expected_sha256, 108 | opener=urlopen, 109 | show_progress=self.show_progress, 110 | ) 111 | except DownloadFailure as exc: 112 | raise DownloadError(str(exc)) from exc 113 | atomic_write_text(self.dataset_path, text) 114 | 115 | def _load_rows(self) -> List[Row]: 116 | """Load raw rows from CSV, capturing header separately and validating columns.""" 117 | with self.dataset_path.open("r", encoding="utf8", newline="") as f: 118 | reader = csv.reader(f) 119 | try: 120 | self._columns = next(reader) 121 | except StopIteration as exc: 122 | raise ParseError("CSV file is empty") from exc 123 | required = {"title", "text", "target"} 124 | missing = required - set(self._columns) 125 | if missing: 126 | raise ParseError(f"Missing required column(s): {', '.join(sorted(missing))}") 127 | rows: List[Row] = [] 128 | for row in reader: 129 | if not row or all(cell == "" for cell in row): 130 | continue 131 | # Basic row length guard 132 | if len(row) < len(self._columns): 133 | # Allow shorter if trailing columns empty, pad to columns length 134 | row = row + [""] * (len(self._columns) - len(row)) 135 | rows.append(row) 136 | return rows 137 | 138 | @property 139 | def column_names(self) -> List[str]: 140 | return self._columns 141 | 142 | @property 143 | def labels(self) -> Set[str]: 144 | return set(self._label_cache) 145 | 146 | @property 147 | def data(self) -> List[Row]: 148 | return self._rows 149 | 150 | @staticmethod 151 | def _preprocess_tags(tags: str) -> List[str]: 152 | return [el for el in tags.split("|") if el] 153 | 154 | def _ensure_parsed_tags(self) -> None: 155 | if not self.return_tags or self._parsed_tags is not None: 156 | return 157 | tags_idx = self._columns.index("tags") if "tags" in self._columns else None 158 | parsed: List[List[str]] = [] 159 | for row in self._rows: 160 | raw = row[tags_idx] if tags_idx is not None and tags_idx < len(row) else "" 161 | parsed.append(self._preprocess_tags(raw)) 162 | self._parsed_tags = parsed 163 | 164 | def label_frequencies(self) -> Dict[str, int]: 165 | freqs: Dict[str, int] = {} 166 | tgt_idx = self._columns.index("target") 167 | for row in self._rows: 168 | tgt = row[tgt_idx] 169 | freqs[tgt] = freqs.get(tgt, 0) + 1 170 | return freqs 171 | 172 | def __len__(self) -> int: 173 | return len(self._rows) 174 | 175 | def __getitem__(self, idx: int) -> Sample: 176 | title, text, _tags_raw, target = self._rows[idx] 177 | if self.return_tags: 178 | self._ensure_parsed_tags() 179 | assert self._parsed_tags is not None 180 | return title, text, target, self._parsed_tags[idx] 181 | return title, text, target, None 182 | 183 | def __iter__(self) -> Iterator[Sample]: 184 | for i in range(len(self)): 185 | yield self[i] 186 | 187 | def __repr__(self) -> str: 188 | return f"{self.__class__.__name__}(split={self.split!r}, n_rows={len(self)}, 
n_labels={len(self.labels)}, return_tags={self.return_tags})" 189 | -------------------------------------------------------------------------------- /ua_datasets/question_answering/uasquad_question_answering.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | from typing import Any, Dict, Iterator, List, Optional, Set 7 | from urllib.request import urlopen 8 | 9 | from ua_datasets.utils import DownloadFailure, atomic_write_text, download_text_with_retries 10 | 11 | __all__ = [ 12 | "DownloadError", 13 | "ParseError", 14 | "UaSquadDataset", 15 | "load_ua_squad_v2", 16 | ] 17 | 18 | # Public (lightweight) representation of a SQuAD v2 style example. 19 | # We intentionally keep this a plain dict-compatible shape instead of introducing 20 | # pydantic/dataclasses for each row to avoid overhead and preserve zero heavy deps. 21 | HFStyleExample = Dict[str, Any] 22 | 23 | 24 | class DownloadError(RuntimeError): 25 | """Raised when a split cannot be downloaded after retries or integrity check fails.""" 26 | 27 | 28 | class ParseError(RuntimeError): 29 | """Raised when the JSON file is malformed or yields zero valid QA triplets.""" 30 | 31 | 32 | @dataclass(slots=True) 33 | class UaSquadDataset: 34 | """Ukrainian SQuAD-style Question Answering dataset. 35 | 36 | Parameters 37 | ---------- 38 | root: 39 | Directory where splits will be cached. 40 | split: 41 | One of ``"train"`` or ``"val"``. 42 | download: 43 | If ``True`` (default) downloads the split file if it is missing. 44 | file_map: 45 | Optional mapping from split name to filename. Defaults to 46 | ``{"train": "train.json", "val": "val.json"}``. 47 | base_url: 48 | Base URL path ending with a slash from which filenames are resolved. 49 | """ 50 | 51 | root: Path 52 | split: str = "train" 53 | download: bool = True 54 | file_map: dict[str, List[str]] = field( 55 | default_factory=lambda: { 56 | "train": ["train.json"], 57 | "val": ["val.json", "validation.json", "dev.json", "val.jspon"], 58 | } 59 | ) 60 | base_url: str = "https://huggingface.co/datasets/FIdo-AI/ua-squad/resolve/main/" 61 | force_download: bool = False 62 | max_retries: int = 3 63 | timeout: int = 20 # seconds 64 | expected_sha256: str | None = None 65 | show_progress: bool = True 66 | # If True (default) skip flat-format training examples whose 'answer' value is an empty string. 67 | # This avoids polluting the training set with ambiguous empty-answer placeholders while still 68 | # retaining explicit impossible examples represented by a missing 'answer' key (answer=None). 69 | ignore_empty_answer: bool = True 70 | 71 | dataset_path: Optional[Path] = field(init=False, default=None) 72 | # SQuAD v2 style expanded storage 73 | _examples: List[HFStyleExample] = field(init=False, default_factory=list) 74 | _unique_answers_cache: Set[str] = field(init=False, default_factory=set) 75 | 76 | def __post_init__(self) -> None: 77 | self.root = Path(self.root) 78 | if self.split not in self.file_map: 79 | raise ValueError( 80 | f"Unsupported split '{self.split}'. 
Expected one of: {list(self.file_map)}" 81 | ) 82 | self.dataset_path = self._resolve_or_download_split() 83 | if self.dataset_path is None: 84 | # Graceful empty dataset (tests expect len==0 allowed) 85 | self._examples = [] 86 | return 87 | self._examples = self._parse( 88 | self.dataset_path, 89 | ignore_empty_answer=self.ignore_empty_answer, 90 | split=self.split, 91 | ) 92 | if not self._examples: 93 | raise ParseError( 94 | f"Parsed zero QA examples from '{self.dataset_path}'. File may be malformed." 95 | ) 96 | # Build unique answer cache ignoring empties and impossible examples. 97 | self._unique_answers_cache = { 98 | t 99 | for ex in self._examples 100 | if not ex.get("is_impossible") 101 | for t in ex.get("answers", {}).get("text", []) 102 | if t 103 | } 104 | 105 | @property 106 | def unique_answers(self) -> Set[str]: 107 | return set(self._unique_answers_cache) 108 | 109 | def answer_frequencies(self) -> Dict[str, int]: 110 | freqs: Dict[str, int] = {} 111 | for ex in self._examples: 112 | if ex.get("is_impossible"): 113 | continue 114 | for t in ex.get("answers", {}).get("text", []): 115 | if not t: 116 | continue 117 | freqs[t] = freqs.get(t, 0) + 1 118 | return freqs 119 | 120 | def _resolve_or_download_split(self) -> Path | None: 121 | """Locate or download split file with retries & optional integrity.""" 122 | candidates = self.file_map[self.split] 123 | self.root.mkdir(parents=True, exist_ok=True) 124 | 125 | # Existing file short-circuit 126 | for name in candidates: 127 | path = self.root / name 128 | if path.exists() and not self.force_download: 129 | return path 130 | if not self.download: 131 | return None 132 | 133 | for name in candidates: 134 | path = self.root / name 135 | url = f"{self.base_url}{name}" 136 | try: 137 | text = download_text_with_retries( 138 | url, 139 | timeout=self.timeout, 140 | max_retries=self.max_retries, 141 | expected_sha256=self.expected_sha256, 142 | validate=lambda t: t.lstrip().startswith("{") or t.lstrip().startswith("["), 143 | opener=urlopen, 144 | show_progress=self.show_progress, 145 | ) 146 | atomic_write_text(path, text) 147 | return path 148 | except DownloadFailure: 149 | continue 150 | return None 151 | 152 | @staticmethod 153 | def _parse( 154 | path: Path, 155 | *, 156 | ignore_empty_answer: bool = True, 157 | split: str | None = None, 158 | ) -> List[HFStyleExample]: 159 | """Parse flat (train-like) or nested SQuAD / SQuAD v2 style JSON into HF style examples only.""" 160 | with path.open("r", encoding="utf8") as f: 161 | try: 162 | obj = json.load(f) 163 | except json.JSONDecodeError as exc: 164 | raise ParseError(f"Failed to decode JSON file '{path}': {exc}") from exc 165 | 166 | data = obj.get("data", []) 167 | examples: List[HFStyleExample] = [] 168 | 169 | def _gen_id(question: str, context: str) -> str: 170 | # Lightweight deterministic id (not cryptographic, good enough for local uniqueness) 171 | import hashlib 172 | 173 | h = hashlib.sha1() 174 | h.update((question + "\n" + context).encode("utf-8")) 175 | return h.hexdigest()[:16] 176 | 177 | def _compute_answer_start(context: str, answer_text: str) -> int: 178 | return context.find(answer_text) if answer_text else -1 179 | 180 | nested_format = ( 181 | data 182 | and isinstance(data, list) 183 | and isinstance(data[0], dict) 184 | and "paragraphs" in data[0] 185 | ) 186 | 187 | if nested_format: 188 | # SQuAD / SQuAD v2 style validation (or full) format 189 | for article in data: 190 | title = article.get("title") 191 | for para in article.get("paragraphs", 
[]): 192 | raw_context = para.get("context") 193 | if raw_context is None: 194 | continue 195 | context = str(raw_context).strip() 196 | if not context: 197 | continue 198 | for qa in para.get("qas", []): 199 | raw_question = qa.get("question") 200 | if raw_question is None: 201 | continue 202 | question = str(raw_question).strip() 203 | if not question: 204 | continue 205 | # answers may be empty in SQuAD v2 206 | ans_objs = qa.get("answers") or [] 207 | texts: List[str] = [] 208 | starts: List[int] = [] 209 | for cand in ans_objs: 210 | t = str(cand.get("text", "")).strip() 211 | if not t: 212 | continue 213 | start = cand.get("answer_start") 214 | if isinstance(start, int) and start >= 0: 215 | # validate substring alignment quickly (best effort) 216 | if context[start : start + len(t)] != t: 217 | # fallback to search 218 | start = _compute_answer_start(context, t) 219 | else: 220 | start = _compute_answer_start(context, t) 221 | if start >= 0: 222 | texts.append(t) 223 | starts.append(start) 224 | is_impossible = bool(qa.get("is_impossible", len(texts) == 0)) 225 | examples.append( 226 | { 227 | "id": qa.get("id") or _gen_id(question, context), 228 | "title": title, 229 | "context": context, 230 | "question": question, 231 | "answers": {"text": texts, "answer_start": starts}, 232 | "is_impossible": is_impossible, 233 | } 234 | ) 235 | else: 236 | # Flat simplified train-like structure with singular 'answer' 237 | for item in data: 238 | if not isinstance(item, dict): 239 | continue 240 | question = str(item.get("question", "")).strip() 241 | context = str(item.get("context", "")).strip() 242 | answer = item.get("answer") 243 | if not question or not context: 244 | continue 245 | if answer is None: 246 | # impossible (no answer provided) 247 | texts = [] 248 | starts = [] 249 | is_impossible = True 250 | else: 251 | ans_text = str(answer).strip() 252 | if not ans_text: 253 | # Empty string answer 254 | if ignore_empty_answer and split == "train": 255 | # Skip this example entirely when training to avoid noisy empties. 256 | continue 257 | # Keep as impossible example for non-train splits (evaluation) or when flag disabled. 258 | texts = [] 259 | starts = [] 260 | is_impossible = True 261 | else: 262 | start_pos = _compute_answer_start(context, ans_text) 263 | if start_pos == -1: 264 | # Accept provided answer text even if not found in context for synthetic tests; 265 | # record start as -1 to indicate unknown alignment. 
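# Downstream consumers should therefore treat answer_start == -1 as
# "alignment unknown" rather than as a real character offset into the context.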
266 | texts = [ans_text] 267 | starts = [-1] 268 | is_impossible = False 269 | else: 270 | texts = [ans_text] 271 | starts = [start_pos] 272 | is_impossible = False 273 | examples.append( 274 | { 275 | "id": _gen_id(question, context), 276 | "title": None, 277 | "context": context, 278 | "question": question, 279 | "answers": {"text": texts, "answer_start": starts}, 280 | "is_impossible": is_impossible, 281 | } 282 | ) 283 | 284 | return examples 285 | 286 | def __getitem__(self, idx: int) -> HFStyleExample: 287 | return self._examples[idx] 288 | 289 | def __len__(self) -> int: 290 | return len(self._examples) 291 | 292 | def __iter__(self) -> Iterator[HFStyleExample]: 293 | for ex in self._examples: 294 | yield ex 295 | 296 | def __repr__(self) -> str: 297 | return f"{self.__class__.__name__}(split={self.split!r}, examples={len(self._examples)}, unique_answers={len(self._unique_answers_cache)})" 298 | 299 | def _check_exists(self) -> bool: 300 | return bool(self.dataset_path and self.dataset_path.exists()) 301 | 302 | # ---- SQuAD v2 style accessors ------------------------------------------------- 303 | @property 304 | def examples(self) -> List[HFStyleExample]: 305 | """Full list of SQuAD v2 style examples. 306 | 307 | Each example dict has keys: id, title, context, question, answers, is_impossible. 308 | Answers is a dict {'text': List[str], 'answer_start': List[int]} as expected by 309 | Hugging Face's squad_v2 format. No heavy HF dependency is required here. 310 | """ 311 | return list(self._examples) 312 | 313 | def to_hf_dict(self) -> List[Dict[str, Any]]: # lightweight alias 314 | """Alias returning examples (intended for quick serialization).""" 315 | return self.examples 316 | 317 | def to_hf_dataset(self) -> Any: # pragma: no cover - optional convenience 318 | """Return a Hugging Face Dataset (requires 'datasets' installed). 319 | 320 | This keeps the core library free from the dependency; import is local. 321 | """ 322 | try: # local import to avoid hard dependency 323 | import importlib 324 | 325 | ds_mod = importlib.import_module("datasets") 326 | Dataset = ds_mod.Dataset 327 | except Exception as exc: 328 | raise RuntimeError( 329 | "The 'datasets' package is required for to_hf_dataset(); install with 'pip install datasets'." 330 | ) from exc 331 | return Dataset.from_list(self._examples) 332 | 333 | 334 | # ---------------------------------------------------------------------------- 335 | # Convenience loader mimicking Hugging Face squad_v2 DatasetDict structure. 336 | # ---------------------------------------------------------------------------- 337 | def load_ua_squad_v2( 338 | root: Path | str = Path("./data/ua_squad"), 339 | *, 340 | download: bool = True, 341 | force_download: bool = False, 342 | features: Any | None = None, 343 | ) -> Any: 344 | """Load UA-SQuAD splits and return a ``datasets.DatasetDict`` matching squad_v2 shape. 345 | 346 | Parameters 347 | ---------- 348 | root : Path | str 349 | Root directory where ``train.json`` / ``val.json`` (or fallbacks) reside / will be downloaded. 350 | download : bool 351 | Whether to download missing splits. 352 | force_download : bool 353 | Re-download even if local files exist. 354 | features : Optional[datasets.Features] 355 | Custom features to cast onto the resulting datasets. If omitted a default 356 | SQuAD v2 style schema is applied. 
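Illustrative usage (requires the optional 'datasets' package)::

    >>> dd = load_ua_squad_v2("./data/ua_squad")  # doctest: +SKIP
    >>> dd["validation"][0]["answers"]  # doctest: +SKIP
    {'text': [...], 'answer_start': [...]}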
357 | 358 | Returns 359 | ------- 360 | datasets.DatasetDict 361 | With keys ``train`` and ``validation`` each exposing columns: 362 | id, title, context, question, answers{"text": list[str], "answer_start": list[int]}, is_impossible. 363 | """ 364 | try: # local import to avoid hard dependency 365 | import importlib 366 | 367 | ds_mod = importlib.import_module("datasets") 368 | DatasetDict = ds_mod.DatasetDict 369 | Features = ds_mod.Features 370 | Sequence = ds_mod.Sequence 371 | Value = ds_mod.Value 372 | except ModuleNotFoundError as exc: # pragma: no cover 373 | raise RuntimeError( 374 | "The 'datasets' package is required for load_ua_squad_v2(); install with 'uv add datasets'." 375 | ) from exc 376 | 377 | root = Path(root) 378 | train_ds = UaSquadDataset( 379 | root=root, split="train", download=download, force_download=force_download 380 | ).to_hf_dataset() 381 | val_ds = UaSquadDataset( 382 | root=root, split="val", download=download, force_download=force_download 383 | ).to_hf_dataset() 384 | 385 | if features is None: 386 | features = Features( 387 | { 388 | "id": Value("string"), 389 | "title": Value("string"), 390 | "context": Value("string"), 391 | "question": Value("string"), 392 | "answers": { 393 | "text": Sequence(Value("string")), 394 | "answer_start": Sequence(Value("int32")), 395 | }, 396 | "is_impossible": Value("bool"), 397 | } 398 | ) 399 | 400 | train_ds = train_ds.cast(features) 401 | val_ds = val_ds.cast(features) 402 | return DatasetDict({"train": train_ds, "validation": val_ds}) 403 | -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | revision = 2 3 | requires-python = ">=3.10" 4 | 5 | [[package]] 6 | name = "cfgv" 7 | version = "3.4.0" 8 | source = { registry = "https://pypi.org/simple" } 9 | sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114, upload-time = "2023-08-12T20:38:17.776Z" } 10 | wheels = [ 11 | { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249, upload-time = "2023-08-12T20:38:16.269Z" }, 12 | ] 13 | 14 | [[package]] 15 | name = "click" 16 | version = "8.3.0" 17 | source = { registry = "https://pypi.org/simple" } 18 | dependencies = [ 19 | { name = "colorama", marker = "sys_platform == 'win32'" }, 20 | ] 21 | sdist = { url = "https://files.pythonhosted.org/packages/46/61/de6cd827efad202d7057d93e0fed9294b96952e188f7384832791c7b2254/click-8.3.0.tar.gz", hash = "sha256:e7b8232224eba16f4ebe410c25ced9f7875cb5f3263ffc93cc3e8da705e229c4", size = 276943, upload-time = "2025-09-18T17:32:23.696Z" } 22 | wheels = [ 23 | { url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" }, 24 | ] 25 | 26 | [[package]] 27 | name = "colorama" 28 | version = "0.4.6" 29 | source = { registry = "https://pypi.org/simple" } 30 | sdist = { url = 
"https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } 31 | wheels = [ 32 | { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, 33 | ] 34 | 35 | [[package]] 36 | name = "distlib" 37 | version = "0.4.0" 38 | source = { registry = "https://pypi.org/simple" } 39 | sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" } 40 | wheels = [ 41 | { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, 42 | ] 43 | 44 | [[package]] 45 | name = "exceptiongroup" 46 | version = "1.3.0" 47 | source = { registry = "https://pypi.org/simple" } 48 | dependencies = [ 49 | { name = "typing-extensions", marker = "python_full_version < '3.13'" }, 50 | ] 51 | sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } 52 | wheels = [ 53 | { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" }, 54 | ] 55 | 56 | [[package]] 57 | name = "filelock" 58 | version = "3.20.0" 59 | source = { registry = "https://pypi.org/simple" } 60 | sdist = { url = "https://files.pythonhosted.org/packages/58/46/0028a82567109b5ef6e4d2a1f04a583fb513e6cf9527fcdd09afd817deeb/filelock-3.20.0.tar.gz", hash = "sha256:711e943b4ec6be42e1d4e6690b48dc175c822967466bb31c0c293f34334c13f4", size = 18922, upload-time = "2025-10-08T18:03:50.056Z" } 61 | wheels = [ 62 | { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" }, 63 | ] 64 | 65 | [[package]] 66 | name = "identify" 67 | version = "2.6.15" 68 | source = { registry = "https://pypi.org/simple" } 69 | sdist = { url = "https://files.pythonhosted.org/packages/ff/e7/685de97986c916a6d93b3876139e00eef26ad5bbbd61925d670ae8013449/identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf", size = 99311, upload-time = "2025-10-02T17:43:40.631Z" } 70 | wheels = [ 71 | { url = "https://files.pythonhosted.org/packages/0f/1c/e5fd8f973d4f375adb21565739498e2e9a1e54c858a97b9a8ccfdc81da9b/identify-2.6.15-py2.py3-none-any.whl", hash = 
"sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757", size = 99183, upload-time = "2025-10-02T17:43:39.137Z" }, 72 | ] 73 | 74 | [[package]] 75 | name = "iniconfig" 76 | version = "2.3.0" 77 | source = { registry = "https://pypi.org/simple" } 78 | sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } 79 | wheels = [ 80 | { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, 81 | ] 82 | 83 | [[package]] 84 | name = "markdown-it-py" 85 | version = "4.0.0" 86 | source = { registry = "https://pypi.org/simple" } 87 | dependencies = [ 88 | { name = "mdurl" }, 89 | ] 90 | sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } 91 | wheels = [ 92 | { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, 93 | ] 94 | 95 | [[package]] 96 | name = "mdurl" 97 | version = "0.1.2" 98 | source = { registry = "https://pypi.org/simple" } 99 | sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } 100 | wheels = [ 101 | { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, 102 | ] 103 | 104 | [[package]] 105 | name = "mypy" 106 | version = "1.18.2" 107 | source = { registry = "https://pypi.org/simple" } 108 | dependencies = [ 109 | { name = "mypy-extensions" }, 110 | { name = "pathspec" }, 111 | { name = "tomli", marker = "python_full_version < '3.11'" }, 112 | { name = "typing-extensions" }, 113 | ] 114 | sdist = { url = "https://files.pythonhosted.org/packages/c0/77/8f0d0001ffad290cef2f7f216f96c814866248a0b92a722365ed54648e7e/mypy-1.18.2.tar.gz", hash = "sha256:06a398102a5f203d7477b2923dda3634c36727fa5c237d8f859ef90c42a9924b", size = 3448846, upload-time = "2025-09-19T00:11:10.519Z" } 115 | wheels = [ 116 | { url = "https://files.pythonhosted.org/packages/03/6f/657961a0743cff32e6c0611b63ff1c1970a0b482ace35b069203bf705187/mypy-1.18.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1eab0cf6294dafe397c261a75f96dc2c31bffe3b944faa24db5def4e2b0f77c", size = 12807973, upload-time = "2025-09-19T00:10:35.282Z" }, 117 | { url = "https://files.pythonhosted.org/packages/10/e9/420822d4f661f13ca8900f5fa239b40ee3be8b62b32f3357df9a3045a08b/mypy-1.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:7a780ca61fc239e4865968ebc5240bb3bf610ef59ac398de9a7421b54e4a207e", size = 11896527, upload-time = "2025-09-19T00:10:55.791Z" }, 118 | { url = "https://files.pythonhosted.org/packages/aa/73/a05b2bbaa7005f4642fcfe40fb73f2b4fb6bb44229bd585b5878e9a87ef8/mypy-1.18.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448acd386266989ef11662ce3c8011fd2a7b632e0ec7d61a98edd8e27472225b", size = 12507004, upload-time = "2025-09-19T00:11:05.411Z" }, 119 | { url = "https://files.pythonhosted.org/packages/4f/01/f6e4b9f0d031c11ccbd6f17da26564f3a0f3c4155af344006434b0a05a9d/mypy-1.18.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f9e171c465ad3901dc652643ee4bffa8e9fef4d7d0eece23b428908c77a76a66", size = 13245947, upload-time = "2025-09-19T00:10:46.923Z" }, 120 | { url = "https://files.pythonhosted.org/packages/d7/97/19727e7499bfa1ae0773d06afd30ac66a58ed7437d940c70548634b24185/mypy-1.18.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:592ec214750bc00741af1f80cbf96b5013d81486b7bb24cb052382c19e40b428", size = 13499217, upload-time = "2025-09-19T00:09:39.472Z" }, 121 | { url = "https://files.pythonhosted.org/packages/9f/4f/90dc8c15c1441bf31cf0f9918bb077e452618708199e530f4cbd5cede6ff/mypy-1.18.2-cp310-cp310-win_amd64.whl", hash = "sha256:7fb95f97199ea11769ebe3638c29b550b5221e997c63b14ef93d2e971606ebed", size = 9766753, upload-time = "2025-09-19T00:10:49.161Z" }, 122 | { url = "https://files.pythonhosted.org/packages/88/87/cafd3ae563f88f94eec33f35ff722d043e09832ea8530ef149ec1efbaf08/mypy-1.18.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:807d9315ab9d464125aa9fcf6d84fde6e1dc67da0b6f80e7405506b8ac72bc7f", size = 12731198, upload-time = "2025-09-19T00:09:44.857Z" }, 123 | { url = "https://files.pythonhosted.org/packages/0f/e0/1e96c3d4266a06d4b0197ace5356d67d937d8358e2ee3ffac71faa843724/mypy-1.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:776bb00de1778caf4db739c6e83919c1d85a448f71979b6a0edd774ea8399341", size = 11817879, upload-time = "2025-09-19T00:09:47.131Z" }, 124 | { url = "https://files.pythonhosted.org/packages/72/ef/0c9ba89eb03453e76bdac5a78b08260a848c7bfc5d6603634774d9cd9525/mypy-1.18.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1379451880512ffce14505493bd9fe469e0697543717298242574882cf8cdb8d", size = 12427292, upload-time = "2025-09-19T00:10:22.472Z" }, 125 | { url = "https://files.pythonhosted.org/packages/1a/52/ec4a061dd599eb8179d5411d99775bec2a20542505988f40fc2fee781068/mypy-1.18.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1331eb7fd110d60c24999893320967594ff84c38ac6d19e0a76c5fd809a84c86", size = 13163750, upload-time = "2025-09-19T00:09:51.472Z" }, 126 | { url = "https://files.pythonhosted.org/packages/c4/5f/2cf2ceb3b36372d51568f2208c021870fe7834cf3186b653ac6446511839/mypy-1.18.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3ca30b50a51e7ba93b00422e486cbb124f1c56a535e20eff7b2d6ab72b3b2e37", size = 13351827, upload-time = "2025-09-19T00:09:58.311Z" }, 127 | { url = "https://files.pythonhosted.org/packages/c8/7d/2697b930179e7277529eaaec1513f8de622818696857f689e4a5432e5e27/mypy-1.18.2-cp311-cp311-win_amd64.whl", hash = "sha256:664dc726e67fa54e14536f6e1224bcfce1d9e5ac02426d2326e2bb4e081d1ce8", size = 9757983, upload-time = "2025-09-19T00:10:09.071Z" }, 128 | { url = 
"https://files.pythonhosted.org/packages/07/06/dfdd2bc60c66611dd8335f463818514733bc763e4760dee289dcc33df709/mypy-1.18.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:33eca32dd124b29400c31d7cf784e795b050ace0e1f91b8dc035672725617e34", size = 12908273, upload-time = "2025-09-19T00:10:58.321Z" }, 129 | { url = "https://files.pythonhosted.org/packages/81/14/6a9de6d13a122d5608e1a04130724caf9170333ac5a924e10f670687d3eb/mypy-1.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a3c47adf30d65e89b2dcd2fa32f3aeb5e94ca970d2c15fcb25e297871c8e4764", size = 11920910, upload-time = "2025-09-19T00:10:20.043Z" }, 130 | { url = "https://files.pythonhosted.org/packages/5f/a9/b29de53e42f18e8cc547e38daa9dfa132ffdc64f7250e353f5c8cdd44bee/mypy-1.18.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d6c838e831a062f5f29d11c9057c6009f60cb294fea33a98422688181fe2893", size = 12465585, upload-time = "2025-09-19T00:10:33.005Z" }, 131 | { url = "https://files.pythonhosted.org/packages/77/ae/6c3d2c7c61ff21f2bee938c917616c92ebf852f015fb55917fd6e2811db2/mypy-1.18.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01199871b6110a2ce984bde85acd481232d17413868c9807e95c1b0739a58914", size = 13348562, upload-time = "2025-09-19T00:10:11.51Z" }, 132 | { url = "https://files.pythonhosted.org/packages/4d/31/aec68ab3b4aebdf8f36d191b0685d99faa899ab990753ca0fee60fb99511/mypy-1.18.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a2afc0fa0b0e91b4599ddfe0f91e2c26c2b5a5ab263737e998d6817874c5f7c8", size = 13533296, upload-time = "2025-09-19T00:10:06.568Z" }, 133 | { url = "https://files.pythonhosted.org/packages/9f/83/abcb3ad9478fca3ebeb6a5358bb0b22c95ea42b43b7789c7fb1297ca44f4/mypy-1.18.2-cp312-cp312-win_amd64.whl", hash = "sha256:d8068d0afe682c7c4897c0f7ce84ea77f6de953262b12d07038f4d296d547074", size = 9828828, upload-time = "2025-09-19T00:10:28.203Z" }, 134 | { url = "https://files.pythonhosted.org/packages/5f/04/7f462e6fbba87a72bc8097b93f6842499c428a6ff0c81dd46948d175afe8/mypy-1.18.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:07b8b0f580ca6d289e69209ec9d3911b4a26e5abfde32228a288eb79df129fcc", size = 12898728, upload-time = "2025-09-19T00:10:01.33Z" }, 135 | { url = "https://files.pythonhosted.org/packages/99/5b/61ed4efb64f1871b41fd0b82d29a64640f3516078f6c7905b68ab1ad8b13/mypy-1.18.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ed4482847168439651d3feee5833ccedbf6657e964572706a2adb1f7fa4dfe2e", size = 11910758, upload-time = "2025-09-19T00:10:42.607Z" }, 136 | { url = "https://files.pythonhosted.org/packages/3c/46/d297d4b683cc89a6e4108c4250a6a6b717f5fa96e1a30a7944a6da44da35/mypy-1.18.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c3ad2afadd1e9fea5cf99a45a822346971ede8685cc581ed9cd4d42eaf940986", size = 12475342, upload-time = "2025-09-19T00:11:00.371Z" }, 137 | { url = "https://files.pythonhosted.org/packages/83/45/4798f4d00df13eae3bfdf726c9244bcb495ab5bd588c0eed93a2f2dd67f3/mypy-1.18.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a431a6f1ef14cf8c144c6b14793a23ec4eae3db28277c358136e79d7d062f62d", size = 13338709, upload-time = "2025-09-19T00:11:03.358Z" }, 138 | { url = "https://files.pythonhosted.org/packages/d7/09/479f7358d9625172521a87a9271ddd2441e1dab16a09708f056e97007207/mypy-1.18.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:7ab28cc197f1dd77a67e1c6f35cd1f8e8b73ed2217e4fc005f9e6a504e46e7ba", size = 13529806, upload-time = "2025-09-19T00:10:26.073Z" }, 139 | { url = "https://files.pythonhosted.org/packages/71/cf/ac0f2c7e9d0ea3c75cd99dff7aec1c9df4a1376537cb90e4c882267ee7e9/mypy-1.18.2-cp313-cp313-win_amd64.whl", hash = "sha256:0e2785a84b34a72ba55fb5daf079a1003a34c05b22238da94fcae2bbe46f3544", size = 9833262, upload-time = "2025-09-19T00:10:40.035Z" }, 140 | { url = "https://files.pythonhosted.org/packages/5a/0c/7d5300883da16f0063ae53996358758b2a2df2a09c72a5061fa79a1f5006/mypy-1.18.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:62f0e1e988ad41c2a110edde6c398383a889d95b36b3e60bcf155f5164c4fdce", size = 12893775, upload-time = "2025-09-19T00:10:03.814Z" }, 141 | { url = "https://files.pythonhosted.org/packages/50/df/2cffbf25737bdb236f60c973edf62e3e7b4ee1c25b6878629e88e2cde967/mypy-1.18.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8795a039bab805ff0c1dfdb8cd3344642c2b99b8e439d057aba30850b8d3423d", size = 11936852, upload-time = "2025-09-19T00:10:51.631Z" }, 142 | { url = "https://files.pythonhosted.org/packages/be/50/34059de13dd269227fb4a03be1faee6e2a4b04a2051c82ac0a0b5a773c9a/mypy-1.18.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ca1e64b24a700ab5ce10133f7ccd956a04715463d30498e64ea8715236f9c9c", size = 12480242, upload-time = "2025-09-19T00:11:07.955Z" }, 143 | { url = "https://files.pythonhosted.org/packages/5b/11/040983fad5132d85914c874a2836252bbc57832065548885b5bb5b0d4359/mypy-1.18.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d924eef3795cc89fecf6bedc6ed32b33ac13e8321344f6ddbf8ee89f706c05cb", size = 13326683, upload-time = "2025-09-19T00:09:55.572Z" }, 144 | { url = "https://files.pythonhosted.org/packages/e9/ba/89b2901dd77414dd7a8c8729985832a5735053be15b744c18e4586e506ef/mypy-1.18.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:20c02215a080e3a2be3aa50506c67242df1c151eaba0dcbc1e4e557922a26075", size = 13514749, upload-time = "2025-09-19T00:10:44.827Z" }, 145 | { url = "https://files.pythonhosted.org/packages/25/bc/cc98767cffd6b2928ba680f3e5bc969c4152bf7c2d83f92f5a504b92b0eb/mypy-1.18.2-cp314-cp314-win_amd64.whl", hash = "sha256:749b5f83198f1ca64345603118a6f01a4e99ad4bf9d103ddc5a3200cc4614adf", size = 9982959, upload-time = "2025-09-19T00:10:37.344Z" }, 146 | { url = "https://files.pythonhosted.org/packages/87/e3/be76d87158ebafa0309946c4a73831974d4d6ab4f4ef40c3b53a385a66fd/mypy-1.18.2-py3-none-any.whl", hash = "sha256:22a1748707dd62b58d2ae53562ffc4d7f8bcc727e8ac7cbc69c053ddc874d47e", size = 2352367, upload-time = "2025-09-19T00:10:15.489Z" }, 147 | ] 148 | 149 | [[package]] 150 | name = "mypy-extensions" 151 | version = "1.1.0" 152 | source = { registry = "https://pypi.org/simple" } 153 | sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" } 154 | wheels = [ 155 | { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, 156 | ] 157 | 158 | [[package]] 159 | name = "nodeenv" 160 | version = "1.9.1" 161 | 
source = { registry = "https://pypi.org/simple" } 162 | sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } 163 | wheels = [ 164 | { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, 165 | ] 166 | 167 | [[package]] 168 | name = "packaging" 169 | version = "24.2" 170 | source = { registry = "https://pypi.org/simple" } 171 | sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950, upload-time = "2024-11-08T09:47:47.202Z" } 172 | wheels = [ 173 | { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451, upload-time = "2024-11-08T09:47:44.722Z" }, 174 | ] 175 | 176 | [[package]] 177 | name = "pathspec" 178 | version = "0.12.1" 179 | source = { registry = "https://pypi.org/simple" } 180 | sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } 181 | wheels = [ 182 | { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, 183 | ] 184 | 185 | [[package]] 186 | name = "platformdirs" 187 | version = "4.5.0" 188 | source = { registry = "https://pypi.org/simple" } 189 | sdist = { url = "https://files.pythonhosted.org/packages/61/33/9611380c2bdb1225fdef633e2a9610622310fed35ab11dac9620972ee088/platformdirs-4.5.0.tar.gz", hash = "sha256:70ddccdd7c99fc5942e9fc25636a8b34d04c24b335100223152c2803e4063312", size = 21632, upload-time = "2025-10-08T17:44:48.791Z" } 190 | wheels = [ 191 | { url = "https://files.pythonhosted.org/packages/73/cb/ac7874b3e5d58441674fb70742e6c374b28b0c7cb988d37d991cde47166c/platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3", size = 18651, upload-time = "2025-10-08T17:44:47.223Z" }, 192 | ] 193 | 194 | [[package]] 195 | name = "pluggy" 196 | version = "1.6.0" 197 | source = { registry = "https://pypi.org/simple" } 198 | sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } 199 | wheels = [ 200 | { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = 
"sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, 201 | ] 202 | 203 | [[package]] 204 | name = "pre-commit" 205 | version = "4.3.0" 206 | source = { registry = "https://pypi.org/simple" } 207 | dependencies = [ 208 | { name = "cfgv" }, 209 | { name = "identify" }, 210 | { name = "nodeenv" }, 211 | { name = "pyyaml" }, 212 | { name = "virtualenv" }, 213 | ] 214 | sdist = { url = "https://files.pythonhosted.org/packages/ff/29/7cf5bbc236333876e4b41f56e06857a87937ce4bf91e117a6991a2dbb02a/pre_commit-4.3.0.tar.gz", hash = "sha256:499fe450cc9d42e9d58e606262795ecb64dd05438943c62b66f6a8673da30b16", size = 193792, upload-time = "2025-08-09T18:56:14.651Z" } 215 | wheels = [ 216 | { url = "https://files.pythonhosted.org/packages/5b/a5/987a405322d78a73b66e39e4a90e4ef156fd7141bf71df987e50717c321b/pre_commit-4.3.0-py2.py3-none-any.whl", hash = "sha256:2b0747ad7e6e967169136edffee14c16e148a778a54e4f967921aa1ebf2308d8", size = 220965, upload-time = "2025-08-09T18:56:13.192Z" }, 217 | ] 218 | 219 | [[package]] 220 | name = "pygments" 221 | version = "2.19.2" 222 | source = { registry = "https://pypi.org/simple" } 223 | sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } 224 | wheels = [ 225 | { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, 226 | ] 227 | 228 | [[package]] 229 | name = "pytest" 230 | version = "8.4.2" 231 | source = { registry = "https://pypi.org/simple" } 232 | dependencies = [ 233 | { name = "colorama", marker = "sys_platform == 'win32'" }, 234 | { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, 235 | { name = "iniconfig" }, 236 | { name = "packaging" }, 237 | { name = "pluggy" }, 238 | { name = "pygments" }, 239 | { name = "tomli", marker = "python_full_version < '3.11'" }, 240 | ] 241 | sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } 242 | wheels = [ 243 | { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, 244 | ] 245 | 246 | [[package]] 247 | name = "pyyaml" 248 | version = "6.0.3" 249 | source = { registry = "https://pypi.org/simple" } 250 | sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } 251 | wheels = [ 252 | { url = "https://files.pythonhosted.org/packages/f4/a0/39350dd17dd6d6c6507025c0e53aef67a9293a6d37d3511f23ea510d5800/pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = 
"sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b", size = 184227, upload-time = "2025-09-25T21:31:46.04Z" }, 253 | { url = "https://files.pythonhosted.org/packages/05/14/52d505b5c59ce73244f59c7a50ecf47093ce4765f116cdb98286a71eeca2/pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956", size = 174019, upload-time = "2025-09-25T21:31:47.706Z" }, 254 | { url = "https://files.pythonhosted.org/packages/43/f7/0e6a5ae5599c838c696adb4e6330a59f463265bfa1e116cfd1fbb0abaaae/pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8", size = 740646, upload-time = "2025-09-25T21:31:49.21Z" }, 255 | { url = "https://files.pythonhosted.org/packages/2f/3a/61b9db1d28f00f8fd0ae760459a5c4bf1b941baf714e207b6eb0657d2578/pyyaml-6.0.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198", size = 840793, upload-time = "2025-09-25T21:31:50.735Z" }, 256 | { url = "https://files.pythonhosted.org/packages/7a/1e/7acc4f0e74c4b3d9531e24739e0ab832a5edf40e64fbae1a9c01941cabd7/pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b", size = 770293, upload-time = "2025-09-25T21:31:51.828Z" }, 257 | { url = "https://files.pythonhosted.org/packages/8b/ef/abd085f06853af0cd59fa5f913d61a8eab65d7639ff2a658d18a25d6a89d/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0", size = 732872, upload-time = "2025-09-25T21:31:53.282Z" }, 258 | { url = "https://files.pythonhosted.org/packages/1f/15/2bc9c8faf6450a8b3c9fc5448ed869c599c0a74ba2669772b1f3a0040180/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69", size = 758828, upload-time = "2025-09-25T21:31:54.807Z" }, 259 | { url = "https://files.pythonhosted.org/packages/a3/00/531e92e88c00f4333ce359e50c19b8d1de9fe8d581b1534e35ccfbc5f393/pyyaml-6.0.3-cp310-cp310-win32.whl", hash = "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e", size = 142415, upload-time = "2025-09-25T21:31:55.885Z" }, 260 | { url = "https://files.pythonhosted.org/packages/2a/fa/926c003379b19fca39dd4634818b00dec6c62d87faf628d1394e137354d4/pyyaml-6.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c", size = 158561, upload-time = "2025-09-25T21:31:57.406Z" }, 261 | { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" }, 262 | { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" }, 263 | { url = 
"https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" }, 264 | { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" }, 265 | { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" }, 266 | { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" }, 267 | { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" }, 268 | { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" }, 269 | { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" }, 270 | { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, 271 | { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, 272 | { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, 273 | { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, 274 | { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, 275 | { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, 276 | { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, 277 | { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, 278 | { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, 279 | { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, 280 | { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, 281 | { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, 282 | { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, 283 | { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, 284 | { url = 
"https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, 285 | { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, 286 | { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, 287 | { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, 288 | { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, 289 | { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, 290 | { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, 291 | { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, 292 | { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, 293 | { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, 294 | { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time 
= "2025-09-25T21:32:40.865Z" }, 295 | { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, 296 | { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, 297 | { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, 298 | { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, 299 | { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, 300 | { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, 301 | { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, 302 | { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, 303 | { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, 304 | { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, 305 | { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", 
size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, 306 | { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, 307 | { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, 308 | ] 309 | 310 | [[package]] 311 | name = "rich" 312 | version = "14.2.0" 313 | source = { registry = "https://pypi.org/simple" } 314 | dependencies = [ 315 | { name = "markdown-it-py" }, 316 | { name = "pygments" }, 317 | ] 318 | sdist = { url = "https://files.pythonhosted.org/packages/fb/d2/8920e102050a0de7bfabeb4c4614a49248cf8d5d7a8d01885fbb24dc767a/rich-14.2.0.tar.gz", hash = "sha256:73ff50c7c0c1c77c8243079283f4edb376f0f6442433aecb8ce7e6d0b92d1fe4", size = 219990, upload-time = "2025-10-09T14:16:53.064Z" } 319 | wheels = [ 320 | { url = "https://files.pythonhosted.org/packages/25/7a/b0178788f8dc6cafce37a212c99565fa1fe7872c70c6c9c1e1a372d9d88f/rich-14.2.0-py3-none-any.whl", hash = "sha256:76bc51fe2e57d2b1be1f96c524b890b816e334ab4c1e45888799bfaab0021edd", size = 243393, upload-time = "2025-10-09T14:16:51.245Z" }, 321 | ] 322 | 323 | [[package]] 324 | name = "ruff" 325 | version = "0.14.2" 326 | source = { registry = "https://pypi.org/simple" } 327 | sdist = { url = "https://files.pythonhosted.org/packages/ee/34/8218a19b2055b80601e8fd201ec723c74c7fe1ca06d525a43ed07b6d8e85/ruff-0.14.2.tar.gz", hash = "sha256:98da787668f239313d9c902ca7c523fe11b8ec3f39345553a51b25abc4629c96", size = 5539663, upload-time = "2025-10-23T19:37:00.956Z" } 328 | wheels = [ 329 | { url = "https://files.pythonhosted.org/packages/16/dd/23eb2db5ad9acae7c845700493b72d3ae214dce0b226f27df89216110f2b/ruff-0.14.2-py3-none-linux_armv6l.whl", hash = "sha256:7cbe4e593505bdec5884c2d0a4d791a90301bc23e49a6b1eb642dd85ef9c64f1", size = 12533390, upload-time = "2025-10-23T19:36:18.044Z" }, 330 | { url = "https://files.pythonhosted.org/packages/5a/8c/5f9acff43ddcf3f85130d0146d0477e28ccecc495f9f684f8f7119b74c0d/ruff-0.14.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:8d54b561729cee92f8d89c316ad7a3f9705533f5903b042399b6ae0ddfc62e11", size = 12887187, upload-time = "2025-10-23T19:36:22.664Z" }, 331 | { url = "https://files.pythonhosted.org/packages/99/fa/047646491479074029665022e9f3dc6f0515797f40a4b6014ea8474c539d/ruff-0.14.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5c8753dfa44ebb2cde10ce5b4d2ef55a41fb9d9b16732a2c5df64620dbda44a3", size = 11925177, upload-time = "2025-10-23T19:36:24.778Z" }, 332 | { url = "https://files.pythonhosted.org/packages/15/8b/c44cf7fe6e59ab24a9d939493a11030b503bdc2a16622cede8b7b1df0114/ruff-0.14.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d0bbeffb8d9f4fccf7b5198d566d0bad99a9cb622f1fc3467af96cb8773c9e3", size = 12358285, upload-time = "2025-10-23T19:36:26.979Z" }, 333 | { url = "https://files.pythonhosted.org/packages/45/01/47701b26254267ef40369aea3acb62a7b23e921c27372d127e0f3af48092/ruff-0.14.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7047f0c5a713a401e43a88d36843d9c83a19c584e63d664474675620aaa634a8", size = 12303832, upload-time = "2025-10-23T19:36:29.192Z" }, 334 | { url = 
"https://files.pythonhosted.org/packages/2d/5c/ae7244ca4fbdf2bee9d6405dcd5bc6ae51ee1df66eb7a9884b77b8af856d/ruff-0.14.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bf8d2f9aa1602599217d82e8e0af7fd33e5878c4d98f37906b7c93f46f9a839", size = 13036995, upload-time = "2025-10-23T19:36:31.861Z" }, 335 | { url = "https://files.pythonhosted.org/packages/27/4c/0860a79ce6fd4c709ac01173f76f929d53f59748d0dcdd662519835dae43/ruff-0.14.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1c505b389e19c57a317cf4b42db824e2fca96ffb3d86766c1c9f8b96d32048a7", size = 14512649, upload-time = "2025-10-23T19:36:33.915Z" }, 336 | { url = "https://files.pythonhosted.org/packages/7f/7f/d365de998069720a3abfc250ddd876fc4b81a403a766c74ff9bde15b5378/ruff-0.14.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a307fc45ebd887b3f26b36d9326bb70bf69b01561950cdcc6c0bdf7bb8e0f7cc", size = 14088182, upload-time = "2025-10-23T19:36:36.983Z" }, 337 | { url = "https://files.pythonhosted.org/packages/6c/ea/d8e3e6b209162000a7be1faa41b0a0c16a133010311edc3329753cc6596a/ruff-0.14.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:61ae91a32c853172f832c2f40bd05fd69f491db7289fb85a9b941ebdd549781a", size = 13599516, upload-time = "2025-10-23T19:36:39.208Z" }, 338 | { url = "https://files.pythonhosted.org/packages/fa/ea/c7810322086db68989fb20a8d5221dd3b79e49e396b01badca07b433ab45/ruff-0.14.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1967e40286f63ee23c615e8e7e98098dedc7301568bd88991f6e544d8ae096", size = 13272690, upload-time = "2025-10-23T19:36:41.453Z" }, 339 | { url = "https://files.pythonhosted.org/packages/a9/39/10b05acf8c45786ef501d454e00937e1b97964f846bf28883d1f9619928a/ruff-0.14.2-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:2877f02119cdebf52a632d743a2e302dea422bfae152ebe2f193d3285a3a65df", size = 13496497, upload-time = "2025-10-23T19:36:43.61Z" }, 340 | { url = "https://files.pythonhosted.org/packages/59/a1/1f25f8301e13751c30895092485fada29076e5e14264bdacc37202e85d24/ruff-0.14.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e681c5bc777de5af898decdcb6ba3321d0d466f4cb43c3e7cc2c3b4e7b843a05", size = 12266116, upload-time = "2025-10-23T19:36:45.625Z" }, 341 | { url = "https://files.pythonhosted.org/packages/5c/fa/0029bfc9ce16ae78164e6923ef392e5f173b793b26cc39aa1d8b366cf9dc/ruff-0.14.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e21be42d72e224736f0c992cdb9959a2fa53c7e943b97ef5d081e13170e3ffc5", size = 12281345, upload-time = "2025-10-23T19:36:47.618Z" }, 342 | { url = "https://files.pythonhosted.org/packages/a5/ab/ece7baa3c0f29b7683be868c024f0838770c16607bea6852e46b202f1ff6/ruff-0.14.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:b8264016f6f209fac16262882dbebf3f8be1629777cf0f37e7aff071b3e9b92e", size = 12629296, upload-time = "2025-10-23T19:36:49.789Z" }, 343 | { url = "https://files.pythonhosted.org/packages/a4/7f/638f54b43f3d4e48c6a68062794e5b367ddac778051806b9e235dfb7aa81/ruff-0.14.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5ca36b4cb4db3067a3b24444463ceea5565ea78b95fe9a07ca7cb7fd16948770", size = 13371610, upload-time = "2025-10-23T19:36:51.882Z" }, 344 | { url = "https://files.pythonhosted.org/packages/8d/35/3654a973ebe5b32e1fd4a08ed2d46755af7267da7ac710d97420d7b8657d/ruff-0.14.2-py3-none-win32.whl", hash = "sha256:41775927d287685e08f48d8eb3f765625ab0b7042cc9377e20e64f4eb0056ee9", size = 12415318, upload-time = "2025-10-23T19:36:53.961Z" }, 345 | { url = 
"https://files.pythonhosted.org/packages/71/30/3758bcf9e0b6a4193a6f51abf84254aba00887dfa8c20aba18aa366c5f57/ruff-0.14.2-py3-none-win_amd64.whl", hash = "sha256:0df3424aa5c3c08b34ed8ce099df1021e3adaca6e90229273496b839e5a7e1af", size = 13565279, upload-time = "2025-10-23T19:36:56.578Z" }, 346 | { url = "https://files.pythonhosted.org/packages/2e/5d/aa883766f8ef9ffbe6aa24f7192fb71632f31a30e77eb39aa2b0dc4290ac/ruff-0.14.2-py3-none-win_arm64.whl", hash = "sha256:ea9d635e83ba21569fbacda7e78afbfeb94911c9434aff06192d9bc23fd5495a", size = 12554956, upload-time = "2025-10-23T19:36:58.714Z" }, 347 | ] 348 | 349 | [[package]] 350 | name = "shellingham" 351 | version = "1.5.4" 352 | source = { registry = "https://pypi.org/simple" } 353 | sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } 354 | wheels = [ 355 | { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, 356 | ] 357 | 358 | [[package]] 359 | name = "tomli" 360 | version = "2.3.0" 361 | source = { registry = "https://pypi.org/simple" } 362 | sdist = { url = "https://files.pythonhosted.org/packages/52/ed/3f73f72945444548f33eba9a87fc7a6e969915e7b1acc8260b30e1f76a2f/tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549", size = 17392, upload-time = "2025-10-08T22:01:47.119Z" } 363 | wheels = [ 364 | { url = "https://files.pythonhosted.org/packages/b3/2e/299f62b401438d5fe1624119c723f5d877acc86a4c2492da405626665f12/tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45", size = 153236, upload-time = "2025-10-08T22:01:00.137Z" }, 365 | { url = "https://files.pythonhosted.org/packages/86/7f/d8fffe6a7aefdb61bced88fcb5e280cfd71e08939da5894161bd71bea022/tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba", size = 148084, upload-time = "2025-10-08T22:01:01.63Z" }, 366 | { url = "https://files.pythonhosted.org/packages/47/5c/24935fb6a2ee63e86d80e4d3b58b222dafaf438c416752c8b58537c8b89a/tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf", size = 234832, upload-time = "2025-10-08T22:01:02.543Z" }, 367 | { url = "https://files.pythonhosted.org/packages/89/da/75dfd804fc11e6612846758a23f13271b76d577e299592b4371a4ca4cd09/tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441", size = 242052, upload-time = "2025-10-08T22:01:03.836Z" }, 368 | { url = "https://files.pythonhosted.org/packages/70/8c/f48ac899f7b3ca7eb13af73bacbc93aec37f9c954df3c08ad96991c8c373/tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845", size = 239555, upload-time = "2025-10-08T22:01:04.834Z" }, 369 | { url = 
"https://files.pythonhosted.org/packages/ba/28/72f8afd73f1d0e7829bfc093f4cb98ce0a40ffc0cc997009ee1ed94ba705/tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c", size = 245128, upload-time = "2025-10-08T22:01:05.84Z" }, 370 | { url = "https://files.pythonhosted.org/packages/b6/eb/a7679c8ac85208706d27436e8d421dfa39d4c914dcf5fa8083a9305f58d9/tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456", size = 96445, upload-time = "2025-10-08T22:01:06.896Z" }, 371 | { url = "https://files.pythonhosted.org/packages/0a/fe/3d3420c4cb1ad9cb462fb52967080575f15898da97e21cb6f1361d505383/tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be", size = 107165, upload-time = "2025-10-08T22:01:08.107Z" }, 372 | { url = "https://files.pythonhosted.org/packages/ff/b7/40f36368fcabc518bb11c8f06379a0fd631985046c038aca08c6d6a43c6e/tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac", size = 154891, upload-time = "2025-10-08T22:01:09.082Z" }, 373 | { url = "https://files.pythonhosted.org/packages/f9/3f/d9dd692199e3b3aab2e4e4dd948abd0f790d9ded8cd10cbaae276a898434/tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22", size = 148796, upload-time = "2025-10-08T22:01:10.266Z" }, 374 | { url = "https://files.pythonhosted.org/packages/60/83/59bff4996c2cf9f9387a0f5a3394629c7efa5ef16142076a23a90f1955fa/tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f", size = 242121, upload-time = "2025-10-08T22:01:11.332Z" }, 375 | { url = "https://files.pythonhosted.org/packages/45/e5/7c5119ff39de8693d6baab6c0b6dcb556d192c165596e9fc231ea1052041/tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52", size = 250070, upload-time = "2025-10-08T22:01:12.498Z" }, 376 | { url = "https://files.pythonhosted.org/packages/45/12/ad5126d3a278f27e6701abde51d342aa78d06e27ce2bb596a01f7709a5a2/tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8", size = 245859, upload-time = "2025-10-08T22:01:13.551Z" }, 377 | { url = "https://files.pythonhosted.org/packages/fb/a1/4d6865da6a71c603cfe6ad0e6556c73c76548557a8d658f9e3b142df245f/tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6", size = 250296, upload-time = "2025-10-08T22:01:14.614Z" }, 378 | { url = "https://files.pythonhosted.org/packages/a0/b7/a7a7042715d55c9ba6e8b196d65d2cb662578b4d8cd17d882d45322b0d78/tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876", size = 97124, upload-time = "2025-10-08T22:01:15.629Z" }, 379 | { url = "https://files.pythonhosted.org/packages/06/1e/f22f100db15a68b520664eb3328fb0ae4e90530887928558112c8d1f4515/tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878", size = 107698, upload-time = "2025-10-08T22:01:16.51Z" }, 380 | { url = 
"https://files.pythonhosted.org/packages/89/48/06ee6eabe4fdd9ecd48bf488f4ac783844fd777f547b8d1b61c11939974e/tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b", size = 154819, upload-time = "2025-10-08T22:01:17.964Z" }, 381 | { url = "https://files.pythonhosted.org/packages/f1/01/88793757d54d8937015c75dcdfb673c65471945f6be98e6a0410fba167ed/tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae", size = 148766, upload-time = "2025-10-08T22:01:18.959Z" }, 382 | { url = "https://files.pythonhosted.org/packages/42/17/5e2c956f0144b812e7e107f94f1cc54af734eb17b5191c0bbfb72de5e93e/tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b", size = 240771, upload-time = "2025-10-08T22:01:20.106Z" }, 383 | { url = "https://files.pythonhosted.org/packages/d5/f4/0fbd014909748706c01d16824eadb0307115f9562a15cbb012cd9b3512c5/tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf", size = 248586, upload-time = "2025-10-08T22:01:21.164Z" }, 384 | { url = "https://files.pythonhosted.org/packages/30/77/fed85e114bde5e81ecf9bc5da0cc69f2914b38f4708c80ae67d0c10180c5/tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f", size = 244792, upload-time = "2025-10-08T22:01:22.417Z" }, 385 | { url = "https://files.pythonhosted.org/packages/55/92/afed3d497f7c186dc71e6ee6d4fcb0acfa5f7d0a1a2878f8beae379ae0cc/tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05", size = 248909, upload-time = "2025-10-08T22:01:23.859Z" }, 386 | { url = "https://files.pythonhosted.org/packages/f8/84/ef50c51b5a9472e7265ce1ffc7f24cd4023d289e109f669bdb1553f6a7c2/tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606", size = 96946, upload-time = "2025-10-08T22:01:24.893Z" }, 387 | { url = "https://files.pythonhosted.org/packages/b2/b7/718cd1da0884f281f95ccfa3a6cc572d30053cba64603f79d431d3c9b61b/tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999", size = 107705, upload-time = "2025-10-08T22:01:26.153Z" }, 388 | { url = "https://files.pythonhosted.org/packages/19/94/aeafa14a52e16163008060506fcb6aa1949d13548d13752171a755c65611/tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e", size = 154244, upload-time = "2025-10-08T22:01:27.06Z" }, 389 | { url = "https://files.pythonhosted.org/packages/db/e4/1e58409aa78eefa47ccd19779fc6f36787edbe7d4cd330eeeedb33a4515b/tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3", size = 148637, upload-time = "2025-10-08T22:01:28.059Z" }, 390 | { url = "https://files.pythonhosted.org/packages/26/b6/d1eccb62f665e44359226811064596dd6a366ea1f985839c566cd61525ae/tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc", size = 241925, upload-time = "2025-10-08T22:01:29.066Z" }, 391 | { url 
= "https://files.pythonhosted.org/packages/70/91/7cdab9a03e6d3d2bb11beae108da5bdc1c34bdeb06e21163482544ddcc90/tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0", size = 249045, upload-time = "2025-10-08T22:01:31.98Z" }, 392 | { url = "https://files.pythonhosted.org/packages/15/1b/8c26874ed1f6e4f1fcfeb868db8a794cbe9f227299402db58cfcc858766c/tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879", size = 245835, upload-time = "2025-10-08T22:01:32.989Z" }, 393 | { url = "https://files.pythonhosted.org/packages/fd/42/8e3c6a9a4b1a1360c1a2a39f0b972cef2cc9ebd56025168c4137192a9321/tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005", size = 253109, upload-time = "2025-10-08T22:01:34.052Z" }, 394 | { url = "https://files.pythonhosted.org/packages/22/0c/b4da635000a71b5f80130937eeac12e686eefb376b8dee113b4a582bba42/tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463", size = 97930, upload-time = "2025-10-08T22:01:35.082Z" }, 395 | { url = "https://files.pythonhosted.org/packages/b9/74/cb1abc870a418ae99cd5c9547d6bce30701a954e0e721821df483ef7223c/tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8", size = 107964, upload-time = "2025-10-08T22:01:36.057Z" }, 396 | { url = "https://files.pythonhosted.org/packages/54/78/5c46fff6432a712af9f792944f4fcd7067d8823157949f4e40c56b8b3c83/tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77", size = 163065, upload-time = "2025-10-08T22:01:37.27Z" }, 397 | { url = "https://files.pythonhosted.org/packages/39/67/f85d9bd23182f45eca8939cd2bc7050e1f90c41f4a2ecbbd5963a1d1c486/tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf", size = 159088, upload-time = "2025-10-08T22:01:38.235Z" }, 398 | { url = "https://files.pythonhosted.org/packages/26/5a/4b546a0405b9cc0659b399f12b6adb750757baf04250b148d3c5059fc4eb/tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530", size = 268193, upload-time = "2025-10-08T22:01:39.712Z" }, 399 | { url = "https://files.pythonhosted.org/packages/42/4f/2c12a72ae22cf7b59a7fe75b3465b7aba40ea9145d026ba41cb382075b0e/tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b", size = 275488, upload-time = "2025-10-08T22:01:40.773Z" }, 400 | { url = "https://files.pythonhosted.org/packages/92/04/a038d65dbe160c3aa5a624e93ad98111090f6804027d474ba9c37c8ae186/tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67", size = 272669, upload-time = "2025-10-08T22:01:41.824Z" }, 401 | { url = "https://files.pythonhosted.org/packages/be/2f/8b7c60a9d1612a7cbc39ffcca4f21a73bf368a80fc25bccf8253e2563267/tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f", size = 279709, upload-time = "2025-10-08T22:01:43.177Z" }, 
402 | { url = "https://files.pythonhosted.org/packages/7e/46/cc36c679f09f27ded940281c38607716c86cf8ba4a518d524e349c8b4874/tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0", size = 107563, upload-time = "2025-10-08T22:01:44.233Z" }, 403 | { url = "https://files.pythonhosted.org/packages/84/ff/426ca8683cf7b753614480484f6437f568fd2fda2edbdf57a2d3d8b27a0b/tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba", size = 119756, upload-time = "2025-10-08T22:01:45.234Z" }, 404 | { url = "https://files.pythonhosted.org/packages/77/b8/0135fadc89e73be292b473cb820b4f5a08197779206b33191e801feeae40/tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b", size = 14408, upload-time = "2025-10-08T22:01:46.04Z" }, 405 | ] 406 | 407 | [[package]] 408 | name = "tomlkit" 409 | version = "0.13.3" 410 | source = { registry = "https://pypi.org/simple" } 411 | sdist = { url = "https://files.pythonhosted.org/packages/cc/18/0bbf3884e9eaa38819ebe46a7bd25dcd56b67434402b66a58c4b8e552575/tomlkit-0.13.3.tar.gz", hash = "sha256:430cf247ee57df2b94ee3fbe588e71d362a941ebb545dec29b53961d61add2a1", size = 185207, upload-time = "2025-06-05T07:13:44.947Z" } 412 | wheels = [ 413 | { url = "https://files.pythonhosted.org/packages/bd/75/8539d011f6be8e29f339c42e633aae3cb73bffa95dd0f9adec09b9c58e85/tomlkit-0.13.3-py3-none-any.whl", hash = "sha256:c89c649d79ee40629a9fda55f8ace8c6a1b42deb912b2a8fd8d942ddadb606b0", size = 38901, upload-time = "2025-06-05T07:13:43.546Z" }, 414 | ] 415 | 416 | [[package]] 417 | name = "typer" 418 | version = "0.20.0" 419 | source = { registry = "https://pypi.org/simple" } 420 | dependencies = [ 421 | { name = "click" }, 422 | { name = "rich" }, 423 | { name = "shellingham" }, 424 | { name = "typing-extensions" }, 425 | ] 426 | sdist = { url = "https://files.pythonhosted.org/packages/8f/28/7c85c8032b91dbe79725b6f17d2fffc595dff06a35c7a30a37bef73a1ab4/typer-0.20.0.tar.gz", hash = "sha256:1aaf6494031793e4876fb0bacfa6a912b551cf43c1e63c800df8b1a866720c37", size = 106492, upload-time = "2025-10-20T17:03:49.445Z" } 427 | wheels = [ 428 | { url = "https://files.pythonhosted.org/packages/78/64/7713ffe4b5983314e9d436a90d5bd4f63b6054e2aca783a3cfc44cb95bbf/typer-0.20.0-py3-none-any.whl", hash = "sha256:5b463df6793ec1dca6213a3cf4c0f03bc6e322ac5e16e13ddd622a889489784a", size = 47028, upload-time = "2025-10-20T17:03:47.617Z" }, 429 | ] 430 | 431 | [[package]] 432 | name = "typing-extensions" 433 | version = "4.15.0" 434 | source = { registry = "https://pypi.org/simple" } 435 | sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } 436 | wheels = [ 437 | { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, 438 | ] 439 | 440 | [[package]] 441 | name = "ua-datasets" 442 | version = "1.0.0" 443 | source = { virtual = "." 
} 444 | 445 | [package.dev-dependencies] 446 | dev = [ 447 | { name = "mypy" }, 448 | { name = "pre-commit" }, 449 | { name = "pytest" }, 450 | { name = "ruff" }, 451 | { name = "uv-sort" }, 452 | ] 453 | 454 | [package.metadata] 455 | 456 | [package.metadata.requires-dev] 457 | dev = [ 458 | { name = "mypy", specifier = ">=1.18.2" }, 459 | { name = "pre-commit", specifier = ">=2.21.0" }, 460 | { name = "pytest", specifier = ">=7.4.4" }, 461 | { name = "ruff", specifier = ">=0.14.2" }, 462 | { name = "uv-sort", specifier = ">=0.6.1" }, 463 | ] 464 | 465 | [[package]] 466 | name = "uv-sort" 467 | version = "0.6.1" 468 | source = { registry = "https://pypi.org/simple" } 469 | dependencies = [ 470 | { name = "packaging" }, 471 | { name = "tomlkit" }, 472 | { name = "typer" }, 473 | ] 474 | sdist = { url = "https://files.pythonhosted.org/packages/b3/70/df2501f7821f629c1c0e7dc90076f48ed4a364dc3225201f187a5ecf1608/uv_sort-0.6.1.tar.gz", hash = "sha256:a2f3828aedb60a54a17960ec3c1031e6cf8b711e6321016a6f50e6d30a442865", size = 23536, upload-time = "2025-07-05T00:54:56.213Z" } 475 | wheels = [ 476 | { url = "https://files.pythonhosted.org/packages/70/a0/3ee31db18de67d3ecfaa2fd58238a3a6837c2199419ee1f35cd668b94b3c/uv_sort-0.6.1-py3-none-any.whl", hash = "sha256:3b2df63e74cab5d8a581c12c4629ad297ea56960fb5d5433dcf8eb0aca2e80b9", size = 6409, upload-time = "2025-07-05T00:54:54.905Z" }, 477 | ] 478 | 479 | [[package]] 480 | name = "virtualenv" 481 | version = "20.35.3" 482 | source = { registry = "https://pypi.org/simple" } 483 | dependencies = [ 484 | { name = "distlib" }, 485 | { name = "filelock" }, 486 | { name = "platformdirs" }, 487 | { name = "typing-extensions", marker = "python_full_version < '3.11'" }, 488 | ] 489 | sdist = { url = "https://files.pythonhosted.org/packages/a4/d5/b0ccd381d55c8f45d46f77df6ae59fbc23d19e901e2d523395598e5f4c93/virtualenv-20.35.3.tar.gz", hash = "sha256:4f1a845d131133bdff10590489610c98c168ff99dc75d6c96853801f7f67af44", size = 6002907, upload-time = "2025-10-10T21:23:33.178Z" } 490 | wheels = [ 491 | { url = "https://files.pythonhosted.org/packages/27/73/d9a94da0e9d470a543c1b9d3ccbceb0f59455983088e727b8a1824ed90fb/virtualenv-20.35.3-py3-none-any.whl", hash = "sha256:63d106565078d8c8d0b206d48080f938a8b25361e19432d2c9db40d2899c810a", size = 5981061, upload-time = "2025-10-10T21:23:30.433Z" }, 492 | ] 493 | --------------------------------------------------------------------------------