├── .dvc ├── config ├── .gitignore └── plots │ ├── scatter.json │ ├── default.json │ ├── confusion.json │ └── smooth.json ├── tests ├── __init__.py ├── io │ ├── __init__.py │ └── data │ │ ├── test_file.json.gz │ │ ├── test_file.json.xz │ │ ├── test_file.json.bz2 │ │ └── test_image_rgb.jpg ├── commands │ ├── __init__.py │ ├── conftest.py │ └── test_test.py ├── features │ ├── __init__.py │ ├── data │ │ ├── test_pdf.pdf │ │ ├── test_nifti.nii │ │ ├── test_image_rgb.jpg │ │ ├── test_nifti.nii.gz │ │ ├── test_audio_16000.mp3 │ │ ├── test_audio_16000.pcm │ │ ├── test_audio_44100.mp3 │ │ ├── test_audio_44100.wav │ │ ├── test_audio_48000.opus │ │ ├── test_image_rgba.png │ │ └── test_video_66x50.mov │ └── test_pdf.py ├── fixtures │ └── __init__.py ├── packaged_modules │ ├── __init__.py │ ├── test_sql.py │ ├── test_pandas.py │ ├── test_parquet.py │ └── test_arrow.py ├── test_filelock.py ├── _test_patching.py ├── test_experimental.py ├── test_version.py ├── test_info_utils.py ├── test_exceptions.py ├── distributed_scripts │ └── run_torch_distributed.py ├── test_splits.py ├── test_parallel.py ├── test_dataset_list.py ├── test_sharding_utils.py ├── test_offline_util.py ├── conftest.py ├── test_filesystem.py └── test_hub.py ├── benchmarks ├── results │ ├── .gitkeep │ ├── benchmark_indices_mapping.json │ ├── benchmark_getitem_100B.json │ ├── benchmark_map_filter.json │ ├── benchmark_iterating.json │ └── benchmark_array_xd.json ├── format.py ├── benchmark_indices_mapping.py ├── utils.py ├── benchmark_getitem_100B.py └── benchmark_map_filter.py ├── src └── datasets │ ├── io │ ├── __init__.py │ ├── abc.py │ ├── spark.py │ ├── text.py │ └── generator.py │ ├── utils │ ├── resources │ │ ├── __init__.py │ │ ├── multilingualities.json │ │ ├── size_categories.json │ │ └── creators.json │ ├── hub.py │ ├── typing.py │ ├── filelock.py │ ├── doc_utils.py │ ├── __init__.py │ ├── experimental.py │ ├── track.py │ ├── _filelock.py │ ├── version.py │ └── deprecation_utils.py │ ├── packaged_modules │ ├── arrow │ │ ├── __init__.py │ │ └── arrow.py │ ├── cache │ │ └── __init__.py │ ├── csv │ │ └── __init__.py │ ├── eval │ │ ├── __init__.py │ │ └── eval.py │ ├── hdf5 │ │ └── __init__.py │ ├── json │ │ └── __init__.py │ ├── pandas │ │ ├── __init__.py │ │ └── pandas.py │ ├── spark │ │ └── __init__.py │ ├── sql │ │ └── __init__.py │ ├── text │ │ └── __init__.py │ ├── xml │ │ ├── __init__.py │ │ └── xml.py │ ├── audiofolder │ │ ├── __init__.py │ │ └── audiofolder.py │ ├── generator │ │ ├── __init__.py │ │ └── generator.py │ ├── imagefolder │ │ ├── __init__.py │ │ └── imagefolder.py │ ├── niftifolder │ │ ├── __init__.py │ │ └── niftifolder.py │ ├── parquet │ │ └── __init__.py │ ├── pdffolder │ │ ├── __init__.py │ │ └── pdffolder.py │ ├── videofolder │ │ ├── __init__.py │ │ └── videofolder.py │ ├── webdataset │ │ └── __init__.py │ └── folder_based_builder │ │ └── __init__.py │ ├── parallel │ └── __init__.py │ ├── download │ ├── __init__.py │ └── download_config.py │ ├── commands │ ├── __init__.py │ ├── datasets_cli.py │ ├── env.py │ └── delete_from_hub.py │ ├── features │ ├── _torchcodec.py │ └── __init__.py │ ├── distributed.py │ ├── filesystems │ └── __init__.py │ ├── __init__.py │ └── naming.py ├── .github ├── conda │ ├── build.sh │ └── meta.yaml ├── workflows │ ├── trufflehog.yml │ ├── upload_pr_documentation.yml │ ├── build_pr_documentation.yml │ ├── build_documentation.yml │ ├── self-assign.yaml │ └── release-conda.yml └── ISSUE_TEMPLATE │ ├── config.yml │ ├── feature-request.yml │ └── bug-report.yml ├── docs └── source │ ├── 
imgs │ ├── course_banner.png │ └── datasets_logo_name.jpg │ ├── _redirects.yml │ ├── _config.py │ ├── package_reference │ ├── builder_classes.mdx │ ├── utilities.mdx │ ├── table_classes.mdx │ └── loading_methods.mdx │ ├── tutorial.md │ ├── cli.mdx │ ├── how_to.md │ ├── nlp_load.mdx │ ├── about_arrow.md │ ├── dataset_card.mdx │ ├── use_with_pandas.mdx │ ├── index.mdx │ ├── use_with_spark.mdx │ ├── filesystems.mdx │ ├── nlp_process.mdx │ ├── image_classification.mdx │ ├── about_map_batch.mdx │ ├── about_cache.mdx │ ├── image_process.mdx │ ├── audio_process.mdx │ ├── installation.md │ └── use_with_pyarrow.mdx ├── .dvcignore ├── .pre-commit-config.yaml ├── AUTHORS ├── ADD_NEW_DATASET.md ├── Makefile ├── pyproject.toml ├── .gitignore ├── SECURITY.md ├── notebooks └── README.md ├── templates └── README.md └── .zenodo.json /.dvc/config: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/results/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/fixtures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/packaged_modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/utils/resources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/arrow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/cache/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/csv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/eval/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/datasets/packaged_modules/hdf5/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/json/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/spark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/sql/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/xml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/audiofolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/generator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/imagefolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/niftifolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/parquet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/pdffolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/videofolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/webdataset/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/datasets/packaged_modules/folder_based_builder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/conda/build.sh: -------------------------------------------------------------------------------- 1 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt 2 | -------------------------------------------------------------------------------- /src/datasets/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .parallel import ParallelBackendConfig, parallel_backend, parallel_map 2 | -------------------------------------------------------------------------------- /tests/io/data/test_file.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/io/data/test_file.json.gz -------------------------------------------------------------------------------- /tests/io/data/test_file.json.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/io/data/test_file.json.xz -------------------------------------------------------------------------------- /tests/features/data/test_pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_pdf.pdf -------------------------------------------------------------------------------- /tests/io/data/test_file.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/io/data/test_file.json.bz2 -------------------------------------------------------------------------------- /tests/io/data/test_image_rgb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/io/data/test_image_rgb.jpg -------------------------------------------------------------------------------- /docs/source/imgs/course_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/docs/source/imgs/course_banner.png -------------------------------------------------------------------------------- /tests/features/data/test_nifti.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_nifti.nii -------------------------------------------------------------------------------- /tests/features/data/test_image_rgb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_image_rgb.jpg -------------------------------------------------------------------------------- /tests/features/data/test_nifti.nii.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_nifti.nii.gz -------------------------------------------------------------------------------- /docs/source/imgs/datasets_logo_name.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/docs/source/imgs/datasets_logo_name.jpg -------------------------------------------------------------------------------- /tests/features/data/test_audio_16000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_16000.mp3 -------------------------------------------------------------------------------- /tests/features/data/test_audio_16000.pcm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_16000.pcm -------------------------------------------------------------------------------- /tests/features/data/test_audio_44100.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_44100.mp3 -------------------------------------------------------------------------------- /tests/features/data/test_audio_44100.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_44100.wav -------------------------------------------------------------------------------- /tests/features/data/test_audio_48000.opus: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_48000.opus -------------------------------------------------------------------------------- /tests/features/data/test_image_rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_image_rgba.png -------------------------------------------------------------------------------- /tests/features/data/test_video_66x50.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_video_66x50.mov -------------------------------------------------------------------------------- /.dvcignore: -------------------------------------------------------------------------------- 1 | # Add patterns of files dvc should ignore, which could improve 2 | # the performance. 
Learn more at 3 | # https://dvc.org/doc/user-guide/dvcignore 4 | -------------------------------------------------------------------------------- /src/datasets/utils/hub.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from huggingface_hub import hf_hub_url 4 | 5 | 6 | hf_dataset_url = partial(hf_hub_url, repo_type="dataset") 7 | -------------------------------------------------------------------------------- /benchmarks/results/benchmark_indices_mapping.json: -------------------------------------------------------------------------------- 1 | {"num examples": 500000, "select": 0.03741131999413483, "sort": 0.7371353159978753, "shuffle": 0.17655655200360343, "train_test_split": 0.29633847798686475, "shard": 0.01452581599005498} -------------------------------------------------------------------------------- /benchmarks/results/benchmark_getitem_100B.json: -------------------------------------------------------------------------------- 1 | {"num examples": 100000000000, "get_first_row": 0.00019991099999927542, "get_last_row": 5.4411000000698095e-05, "get_batch_of_1024_rows": 0.0004897069999998394, "get_batch_of_1024_random_rows": 0.01800621099999944} -------------------------------------------------------------------------------- /src/datasets/utils/resources/multilingualities.json: -------------------------------------------------------------------------------- 1 | { 2 | "monolingual": "contains a single language", 3 | "multilingual": "contains multiple languages", 4 | "translation": "contains translated or aligned text", 5 | "other": "other type of language distribution" 6 | } 7 | -------------------------------------------------------------------------------- /src/datasets/utils/typing.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import TypeVar, Union 3 | 4 | 5 | T = TypeVar("T") 6 | 7 | ListLike = Union[list[T], tuple[T, ...]] 8 | NestedDataStructureLike = Union[T, list[T], dict[str, T]] 9 | PathLike = Union[str, bytes, os.PathLike] 10 | -------------------------------------------------------------------------------- /src/datasets/utils/resources/size_categories.json: -------------------------------------------------------------------------------- 1 | [ 2 | "unknown", 3 | "n<1K", 4 | "1K<n<10K", 5 | "10K<n<100K", 6 | "100K<n<1M", 7 | "1M<n<10M", 8 | "10M<n<100M", 9 | "100M<n<1B", 10 | "1B<n<10B", 11 | "10B<n<100B", 12 | "100B<n<1T", 13 | "n>1T" 14 | ] 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/charliermarsh/ruff-pre-commit # https://github.com/charliermarsh/ruff#usage 3 | rev: 'v0.11.8' 4 | hooks: 5 | # Run the linter. 6 | - id: ruff 7 | args: [ --fix ] 8 | # Run the formatter.
9 | - id: ruff-format 10 | -------------------------------------------------------------------------------- /tests/commands/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from huggingface_hub import snapshot_download 3 | 4 | 5 | @pytest.fixture 6 | def dataset_dir(tmp_path): 7 | dataset_dir = tmp_path / "test_command_dataset_dir" 8 | snapshot_download("hf-internal-testing/ner-jsonl", repo_type="dataset", local_dir=dataset_dir) 9 | return str(dataset_dir) 10 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the list of HuggingFace Datasets authors for copyright purposes. 2 | # 3 | # This does not necessarily list everyone who has contributed code, since in 4 | # some cases, their employer may be the copyright holder. To see the full list 5 | # of contributors, see the revision history in source control. 6 | 7 | Google Inc. 8 | HuggingFace Inc. 9 | -------------------------------------------------------------------------------- /src/datasets/download/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "DownloadConfig", 3 | "DownloadManager", 4 | "DownloadMode", 5 | "StreamingDownloadManager", 6 | ] 7 | 8 | from .download_config import DownloadConfig 9 | from .download_manager import DownloadManager, DownloadMode 10 | from .streaming_download_manager import StreamingDownloadManager 11 | -------------------------------------------------------------------------------- /src/datasets/utils/resources/creators.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": [ 3 | "found", 4 | "crowdsourced", 5 | "expert-generated", 6 | "machine-generated", 7 | "other" 8 | ], 9 | "annotations": [ 10 | "found", 11 | "crowdsourced", 12 | "expert-generated", 13 | "machine-generated", 14 | "no-annotation", 15 | "other" 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /tests/test_filelock.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets.utils._filelock import FileLock 4 | 5 | 6 | def test_long_path(tmpdir): 7 | filename = "a" * 1000 + ".lock" 8 | lock1 = FileLock(str(tmpdir / filename)) 9 | assert lock1.lock_file.endswith(".lock") 10 | assert not lock1.lock_file.endswith(filename) 11 | assert len(os.path.basename(lock1.lock_file)) <= 255 12 | -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | 4 | name: Secret Leaks 5 | 6 | permissions: 7 | contents: read 8 | 9 | jobs: 10 | trufflehog: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | - name: Secret Scanning 18 | uses: trufflesecurity/trufflehog@main 19 | -------------------------------------------------------------------------------- /src/datasets/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseDatasetsCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise 
NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /tests/_test_patching.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 2 | # This is the module that test_patching.py uses to test patch_submodule() 3 | import os 4 | import os as renamed_os 5 | from os import path 6 | from os import path as renamed_path 7 | from os.path import join 8 | from os.path import join as renamed_join 9 | 10 | 11 | open = open # we just need to have a builtin inside this module to test it properly 12 | -------------------------------------------------------------------------------- /ADD_NEW_DATASET.md: -------------------------------------------------------------------------------- 1 | # How to add a new dataset 2 | 3 | Add datasets directly to the 🤗 Hugging Face Hub! 4 | 5 | You can share your dataset on https://huggingface.co/datasets directly using your account; see the documentation: 6 | 7 | * [Create a dataset and upload files on the website](https://huggingface.co/docs/datasets/upload_dataset) 8 | * [Advanced guide using the CLI](https://huggingface.co/docs/datasets/share) 9 | -------------------------------------------------------------------------------- /src/datasets/utils/filelock.py: -------------------------------------------------------------------------------- 1 | # deprecated, please use the `filelock` package instead 2 | 3 | from filelock import ( # noqa: F401 # imported for backward compatibility TODO: remove in 3.0.0 4 | BaseFileLock, 5 | SoftFileLock, 6 | Timeout, 7 | UnixFileLock, 8 | WindowsFileLock, 9 | ) 10 | 11 | from ._filelock import FileLock # noqa: F401 # imported for backward compatibility.
TODO: remove in 3.0.0 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: Datasets on the Hugging Face Hub 3 | url: https://huggingface.co/datasets 4 | about: Please use the "Community" tab of the dataset on the Hugging Face Hub to open a discussion or a pull request 5 | - name: Forum 6 | url: https://discuss.huggingface.co/c/datasets/10 7 | about: Please ask and answer questions here, and engage with other community members 8 | -------------------------------------------------------------------------------- /benchmarks/results/benchmark_map_filter.json: -------------------------------------------------------------------------------- 1 | {"num examples": 500000, "map identity": 10.19139202599763, "map identity batched": 0.6804238399927272, "map no-op batched": 0.5342009569867514, "map no-op batched numpy": 0.5792830920108827, "map no-op batched pandas": 0.4343639040016569, "map no-op batched pytorch": 0.5403374370071106, "map no-op batched tensorflow": 1.3869360350072384, "map fast-tokenizer batched": 8.074308118986664, "filter": 1.841787679004483} -------------------------------------------------------------------------------- /docs/source/_redirects.yml: -------------------------------------------------------------------------------- 1 | # This first_section was backported from nginx 2 | loading_datasets: loading 3 | share_dataset: share 4 | quicktour: quickstart 5 | dataset_streaming: stream 6 | torch_tensorflow: use_dataset 7 | splits: loading#slice-splits 8 | processing: process 9 | faiss_and_ea: faiss_es 10 | features: about_dataset_features 11 | exploring: access 12 | package_reference/logging_methods: package_reference/utilities 13 | # end of first_section 14 | -------------------------------------------------------------------------------- /docs/source/_config.py: -------------------------------------------------------------------------------- 1 | # docstyle-ignore 2 | INSTALL_CONTENT = """ 3 | # Datasets installation 4 | ! pip install datasets transformers 5 | # To install from source instead of the last release, comment the command above and uncomment the following one. 6 | # ! pip install git+https://github.com/huggingface/datasets.git 7 | """ 8 | 9 | notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] 10 | default_branch_name = "main" 11 | version_prefix = "" 12 | -------------------------------------------------------------------------------- /.github/workflows/upload_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Upload PR Documentation 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Build PR Documentation"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | build: 11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main 12 | with: 13 | package_name: datasets 14 | secrets: 15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 16 | comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} -------------------------------------------------------------------------------- /src/datasets/utils/doc_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | 4 | def is_documented_by(function_with_docstring: Callable): 5 | """Decorator to share docstrings across common functions. 
6 | 7 | Args: 8 | function_with_docstring (`Callable`): Name of the function with the docstring. 9 | """ 10 | 11 | def wrapper(target_function): 12 | target_function.__doc__ = function_with_docstring.__doc__ 13 | return target_function 14 | 15 | return wrapper 16 | -------------------------------------------------------------------------------- /.github/workflows/build_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.event.pull_request.head.sha }} 15 | pr_number: ${{ github.event.number }} 16 | package: datasets 17 | -------------------------------------------------------------------------------- /tests/test_experimental.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import warnings 3 | 4 | from datasets.utils import experimental 5 | 6 | 7 | @experimental 8 | def dummy_function(): 9 | return "success" 10 | 11 | 12 | class TestExperimentalFlag(unittest.TestCase): 13 | def test_experimental_warning(self): 14 | with warnings.catch_warnings(record=True) as w: 15 | warnings.simplefilter("always") 16 | self.assertEqual(dummy_function(), "success") 17 | self.assertEqual(len(w), 1) 18 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style test 2 | 3 | check_dirs := tests src benchmarks utils 4 | 5 | # Check that source code meets quality standards 6 | 7 | quality: 8 | ruff check $(check_dirs) setup.py # linter 9 | ruff format --check $(check_dirs) setup.py # formatter 10 | 11 | # Format source code automatically 12 | 13 | style: 14 | ruff check --fix $(check_dirs) setup.py # linter 15 | ruff format $(check_dirs) setup.py # formatter 16 | 17 | # Run tests for the library 18 | 19 | test: 20 | python -m pytest -n auto --dist=loadfile -s -v ./tests/ 21 | -------------------------------------------------------------------------------- /.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - doc-builder* 8 | - v*-release 9 | - v*-patch 10 | 11 | jobs: 12 | build: 13 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main 14 | with: 15 | commit_sha: ${{ github.sha }} 16 | package: datasets 17 | notebook_folder: datasets_doc 18 | secrets: 19 | token: ${{ secrets.HUGGINGFACE_PUSH }} 20 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 21 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/pdffolder/pdffolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class PdfFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """BuilderConfig for ImageFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | 
super().__post_init__() 17 | 18 | 19 | class PdfFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Pdf 21 | BASE_COLUMN_NAME = "pdf" 22 | BUILDER_CONFIG_CLASS = PdfFolderConfig 23 | EXTENSIONS: list[str] = [".pdf"] 24 | -------------------------------------------------------------------------------- /src/datasets/features/_torchcodec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torchcodec.decoders import AudioDecoder as _AudioDecoder 3 | 4 | 5 | class AudioDecoder(_AudioDecoder): 6 | def __getitem__(self, key: str): 7 | if key == "array": 8 | y = self.get_all_samples().data.cpu().numpy() 9 | return np.mean(y, axis=tuple(range(y.ndim - 1))) if y.ndim > 1 else y 10 | elif key == "sampling_rate": 11 | return self.get_samples_played_in_range(0, 0).sample_rate 12 | elif hasattr(super(), "__getitem__"): 13 | return super().__getitem__(key) 14 | else: 15 | raise TypeError("'torchcodec.decoders.AudioDecoder' object is not subscriptable") 16 | -------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.utils.version import Version 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "other, expected_equality", 8 | [ 9 | (Version("1.0.0"), True), 10 | ("1.0.0", True), 11 | (Version("2.0.0"), False), 12 | ("2.0.0", False), 13 | ("1", False), 14 | ("a", False), 15 | (1, False), 16 | (None, False), 17 | ], 18 | ) 19 | def test_version_equality_and_hash(other, expected_equality): 20 | version = Version("1.0.0") 21 | assert (version == other) is expected_equality 22 | assert (version != other) is not expected_equality 23 | assert (hash(version) == hash(other)) is expected_equality 24 | -------------------------------------------------------------------------------- /src/datasets/features/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "Audio", 3 | "Array2D", 4 | "Array3D", 5 | "Array4D", 6 | "Array5D", 7 | "ClassLabel", 8 | "Features", 9 | "LargeList", 10 | "List", 11 | "Sequence", 12 | "Value", 13 | "Image", 14 | "Translation", 15 | "TranslationVariableLanguages", 16 | "Video", 17 | "Pdf", 18 | "Nifti", 19 | ] 20 | from .audio import Audio 21 | from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value 22 | from .image import Image 23 | from .nifti import Nifti 24 | from .pdf import Pdf 25 | from .translation import Translation, TranslationVariableLanguages 26 | from .video import Video 27 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/niftifolder/niftifolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class NiftiFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """BuilderConfig for NiftiFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | super().__post_init__() 17 | 18 | 19 | class NiftiFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Nifti 21 | BASE_COLUMN_NAME = "nifti" 22 | BUILDER_CONFIG_CLASS = NiftiFolderConfig 23 | EXTENSIONS: list[str] = [".nii", 
".nii.gz"] 24 | -------------------------------------------------------------------------------- /tests/packaged_modules/test_sql.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.builder import InvalidConfigName 4 | from datasets.data_files import DataFilesList 5 | from datasets.packaged_modules.sql.sql import SqlConfig 6 | 7 | 8 | def test_config_raises_when_invalid_name() -> None: 9 | with pytest.raises(InvalidConfigName, match="Bad characters"): 10 | _ = SqlConfig(name="name-with-*-invalid-character") 11 | 12 | 13 | @pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])]) 14 | def test_config_raises_when_invalid_data_files(data_files) -> None: 15 | with pytest.raises(ValueError, match="Expected a DataFilesDict"): 16 | _ = SqlConfig(name="name", data_files=data_files) 17 | -------------------------------------------------------------------------------- /tests/packaged_modules/test_pandas.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.builder import InvalidConfigName 4 | from datasets.data_files import DataFilesList 5 | from datasets.packaged_modules.pandas.pandas import PandasConfig 6 | 7 | 8 | def test_config_raises_when_invalid_name() -> None: 9 | with pytest.raises(InvalidConfigName, match="Bad characters"): 10 | _ = PandasConfig(name="name-with-*-invalid-character") 11 | 12 | 13 | @pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])]) 14 | def test_config_raises_when_invalid_data_files(data_files) -> None: 15 | with pytest.raises(ValueError, match="Expected a DataFilesDict"): 16 | _ = PandasConfig(name="name", data_files=data_files) 17 | -------------------------------------------------------------------------------- /tests/packaged_modules/test_parquet.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.builder import InvalidConfigName 4 | from datasets.data_files import DataFilesList 5 | from datasets.packaged_modules.parquet.parquet import ParquetConfig 6 | 7 | 8 | def test_config_raises_when_invalid_name() -> None: 9 | with pytest.raises(InvalidConfigName, match="Bad characters"): 10 | _ = ParquetConfig(name="name-with-*-invalid-character") 11 | 12 | 13 | @pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])]) 14 | def test_config_raises_when_invalid_data_files(data_files) -> None: 15 | with pytest.raises(ValueError, match="Expected a DataFilesDict"): 16 | _ = ParquetConfig(name="name", data_files=data_files) 17 | -------------------------------------------------------------------------------- /.dvc/plots/scatter.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": "point", 8 | "encoding": { 9 | "x": { 10 | "field": "", 11 | "type": "quantitative", 12 | "title": "" 13 | }, 14 | "y": { 15 | "field": "", 16 | "type": "quantitative", 17 | "title": "", 18 | "scale": { 19 | "zero": false 20 | } 21 | }, 22 | "color": { 23 | "field": "rev", 24 | "type": "nominal" 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 119 3 | 4 | [tool.ruff.lint] 5 | # Ignored rules: 6 | # "E501" -> line length violation 7 | # "F821" -> undefined named in type annotation (e.g. Literal["something"]) 8 | # "C901" -> `function_name` is too complex 9 | ignore = ["E501", "F821", "C901"] 10 | select = ["C", "E", "F", "I", "W"] 11 | 12 | [tool.ruff.lint.isort] 13 | lines-after-imports = 2 14 | known-first-party = ["datasets"] 15 | 16 | [tool.ruff.lint.per-file-ignores] 17 | "__init__.py" = ["F401", "F403", "F405"] 18 | 19 | [tool.pytest.ini_options] 20 | # Test fails if a FutureWarning is thrown by `huggingface_hub` 21 | filterwarnings = [ 22 | "error::FutureWarning:huggingface_hub*", 23 | ] 24 | markers = [ 25 | "unit: unit test", 26 | "integration: integration test", 27 | ] 28 | -------------------------------------------------------------------------------- /.dvc/plots/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | "field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /.dvc/plots/confusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": "rect", 8 | "encoding": { 9 | "x": { 10 | "field": "", 11 | "type": "nominal", 12 | "sort": "ascending", 13 | "title": "" 14 | }, 15 | "y": { 16 | "field": "", 17 | "type": "nominal", 18 | "sort": "ascending", 19 | "title": "" 20 | }, 21 | "color": { 22 | "aggregate": "count", 23 | "type": "quantitative" 24 | }, 25 | "facet": { 26 | "field": "rev", 27 | "type": "nominal" 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /.github/workflows/self-assign.yaml: -------------------------------------------------------------------------------- 1 | name: Self-assign 2 | on: 3 | issue_comment: 4 | types: created 5 | jobs: 6 | one: 7 | runs-on: ubuntu-latest 8 | if: >- 9 | (github.event.comment.body == '#take' || 10 | github.event.comment.body == '#self-assign') 11 | && !github.event.issue.assignee 12 | steps: 13 | - run: | 14 | echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" 15 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees 16 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X "DELETE" https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels/help%20wanted 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Locked files 2 | *.lock 3 | !dvc.lock 4 | 5 | # Extracted dummy data 6 | 
datasets/**/dummy_data-zip-extracted/ 7 | 8 | # Compiled python modules. 9 | *.pyc 10 | 11 | # Byte-compiled 12 | _pycache__/ 13 | .cache/ 14 | 15 | # Python egg metadata, regenerated from source files by setuptools. 16 | *.egg-info 17 | .eggs/ 18 | 19 | # PyPI distribution artifacts. 20 | build/ 21 | dist/ 22 | 23 | # Environments 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | ENV/ 29 | env.bak/ 30 | venv.bak/ 31 | 32 | # pyenv 33 | .python-version 34 | 35 | # Tests 36 | .pytest_cache/ 37 | 38 | # Other 39 | *.DS_Store 40 | 41 | # PyCharm/vscode 42 | .idea 43 | .vscode 44 | 45 | # Vim 46 | .*.swp 47 | 48 | # playground 49 | /playground 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | docs/source/_build/ 54 | 55 | # Benchmark results 56 | report.json 57 | report.md 58 | 59 | # Ruff 60 | .ruff_cache 61 | -------------------------------------------------------------------------------- /benchmarks/results/benchmark_iterating.json: -------------------------------------------------------------------------------- 1 | {"num examples": 50000, "read 5000": 0.2152090710005723, "read 50000": 2.077654693988734, "read_batch 50000 10": 1.5041199039987987, "read_batch 50000 100": 1.5411947140091797, "read_batch 50000 1000": 1.4684901159926085, "read_formatted numpy 5000": 4.584776938994764, "read_formatted pandas 5000": 3.7457121399929747, "read_formatted torch 5000": 4.565676491998602, "read_formatted tensorflow 5000": 5.269861594992108, "read_formatted_batch numpy 5000 10": 0.4242750950070331, "read_formatted_batch numpy 5000 1000": 0.007607111998368055, "shuffled read 5000": 0.22604441999283154, "shuffled read 50000": 2.268928524994408, "shuffled read_batch 50000 10": 55.44462437101174, "shuffled read_batch 50000 100": 6.876476717996411, "shuffled read_batch 50000 1000": 2.1420724369963864, "shuffled read_formatted numpy 5000": 4.8052272600034485, "shuffled read_formatted_batch numpy 5000 10": 6.500664097999106, "shuffled read_formatted_batch numpy 5000 1000": 0.0754691059992183} -------------------------------------------------------------------------------- /docs/source/package_reference/builder_classes.mdx: -------------------------------------------------------------------------------- 1 | # Builder classes 2 | 3 | ## Builders 4 | 5 | 🤗 Datasets relies on two main classes during the dataset building process: [`DatasetBuilder`] and [`BuilderConfig`]. 
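For orientation, here is a minimal illustrative sketch of how the two classes typically fit together in a custom builder. The `MyConfig` and `MyDataset` names and the toy in-memory rows are assumptions made for this example, not part of the library:

```python
# Illustrative sketch only: `MyConfig`, `MyDataset`, and the inline rows are made up for this example.
import datasets


class MyConfig(datasets.BuilderConfig):
    """A BuilderConfig subclass can carry extra, dataset-specific parameters."""

    def __init__(self, separator: str = ",", **kwargs):
        super().__init__(**kwargs)
        self.separator = separator


class MyDataset(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIG_CLASS = MyConfig
    BUILDER_CONFIGS = [MyConfig(name="default", version=datasets.Version("1.0.0"))]

    def _info(self):
        # Declares the schema of the generated examples.
        return datasets.DatasetInfo(
            features=datasets.Features({"text": datasets.Value("string")})
        )

    def _split_generators(self, dl_manager):
        # A real builder would use dl_manager here to download/extract data files.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"rows": ["a,b", "c,d"]}
            )
        ]

    def _generate_examples(self, rows):
        for idx, row in enumerate(rows):
            yield idx, {"text": row.replace(self.config.separator, " ")}
```

The `[[autodoc]]` entries below document the full APIs; the sketch only shows the three methods a `GeneratorBasedBuilder` subclass is expected to implement.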
6 | 7 | [[autodoc]] datasets.DatasetBuilder 8 | 9 | [[autodoc]] datasets.GeneratorBasedBuilder 10 | 11 | [[autodoc]] datasets.ArrowBasedBuilder 12 | 13 | [[autodoc]] datasets.BuilderConfig 14 | 15 | ## Download 16 | 17 | [[autodoc]] datasets.DownloadManager 18 | 19 | [[autodoc]] datasets.StreamingDownloadManager 20 | 21 | [[autodoc]] datasets.DownloadConfig 22 | 23 | [[autodoc]] datasets.DownloadMode 24 | 25 | ## Verification 26 | 27 | [[autodoc]] datasets.VerificationMode 28 | 29 | ## Splits 30 | 31 | [[autodoc]] datasets.SplitGenerator 32 | 33 | [[autodoc]] datasets.Split 34 | 35 | [[autodoc]] datasets.NamedSplit 36 | 37 | [[autodoc]] datasets.NamedSplitAll 38 | 39 | [[autodoc]] datasets.ReadInstruction 40 | 41 | ## Version 42 | 43 | [[autodoc]] datasets.utils.Version 44 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/videofolder/videofolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class VideoFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """BuilderConfig for ImageFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | super().__post_init__() 17 | 18 | 19 | class VideoFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Video 21 | BASE_COLUMN_NAME = "video" 22 | BUILDER_CONFIG_CLASS = VideoFolderConfig 23 | EXTENSIONS: list[str] # definition at the bottom of the script 24 | 25 | 26 | # TODO: initial list, we should check the compatibility of other formats 27 | VIDEO_EXTENSIONS = [ 28 | ".mkv", 29 | ".mp4", 30 | ".avi", 31 | ".mpeg", 32 | ".mov", 33 | ] 34 | VideoFolder.EXTENSIONS = VIDEO_EXTENSIONS 35 | -------------------------------------------------------------------------------- /tests/test_info_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datasets.config 4 | from datasets.utils.info_utils import is_small_dataset 5 | 6 | 7 | @pytest.mark.parametrize("dataset_size", [None, 400 * 2**20, 600 * 2**20]) 8 | @pytest.mark.parametrize("input_in_memory_max_size", ["default", 0, 100 * 2**20, 900 * 2**20]) 9 | def test_is_small_dataset(dataset_size, input_in_memory_max_size, monkeypatch): 10 | if input_in_memory_max_size != "default": 11 | monkeypatch.setattr(datasets.config, "IN_MEMORY_MAX_SIZE", input_in_memory_max_size) 12 | in_memory_max_size = datasets.config.IN_MEMORY_MAX_SIZE 13 | if input_in_memory_max_size == "default": 14 | assert in_memory_max_size == 0 15 | else: 16 | assert in_memory_max_size == input_in_memory_max_size 17 | if dataset_size and in_memory_max_size: 18 | expected = dataset_size < in_memory_max_size 19 | else: 20 | expected = False 21 | result = is_small_dataset(dataset_size) 22 | assert result == expected 23 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 15 | 16 | Each major version is currently being supported with security updates. 
17 | 18 | | Version | Supported | 19 | |---------|--------------------| 20 | | 1.x.x | :white_check_mark: | 21 | | 2.x.x | :white_check_mark: | 22 | 23 | 24 | ## Reporting a Vulnerability 25 | 32 | 33 | To report a security vulnerability, please contact: security@huggingface.co 34 | -------------------------------------------------------------------------------- /.dvc/plots/smooth.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | "field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | }, 29 | "transform": [ 30 | { 31 | "loess": "", 32 | "on": "", 33 | "groupby": [ 34 | "rev" 35 | ], 36 | "bandwidth": 0.3 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pytest 4 | 5 | import datasets.utils.deprecation_utils 6 | from datasets.exceptions import ( 7 | ChecksumVerificationError, 8 | ExpectedMoreDownloadedFilesError, 9 | ExpectedMoreSplitsError, 10 | NonMatchingChecksumError, 11 | NonMatchingSplitsSizesError, 12 | SplitsVerificationError, 13 | UnexpectedDownloadedFileError, 14 | UnexpectedSplitsError, 15 | ) 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "error", 20 | [ 21 | ChecksumVerificationError, 22 | UnexpectedDownloadedFileError, 23 | ExpectedMoreDownloadedFilesError, 24 | NonMatchingChecksumError, 25 | SplitsVerificationError, 26 | UnexpectedSplitsError, 27 | ExpectedMoreSplitsError, 28 | NonMatchingSplitsSizesError, 29 | ], 30 | ) 31 | def test_error_not_deprecated(error, monkeypatch): 32 | monkeypatch.setattr(datasets.utils.deprecation_utils, "_emitted_deprecation_warnings", set()) 33 | with warnings.catch_warnings(): 34 | warnings.simplefilter("error") 35 | error() 36 | -------------------------------------------------------------------------------- /src/datasets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . 
import tqdm as _tqdm # _tqdm is the module 16 | from .experimental import experimental 17 | from .info_utils import VerificationMode 18 | from .logging import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled 19 | from .tqdm import ( 20 | are_progress_bars_disabled, 21 | disable_progress_bars, 22 | enable_progress_bars, 23 | tqdm, 24 | ) 25 | from .version import Version 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["enhancement"] 4 | body: 5 | - type: textarea 6 | id: feature-request 7 | attributes: 8 | label: Feature request 9 | description: A clear and concise description of the feature proposal. 10 | validations: 11 | required: true 12 | 13 | - type: textarea 14 | id: motivation 15 | validations: 16 | required: true 17 | attributes: 18 | label: Motivation 19 | description: | 20 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. 21 | 22 | - type: textarea 23 | id: contribution 24 | validations: 25 | required: true 26 | attributes: 27 | label: Your contribution 28 | description: | 29 | Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md). 30 | -------------------------------------------------------------------------------- /docs/source/tutorial.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Welcome to the 🤗 Datasets tutorials! These beginner-friendly tutorials will guide you through the fundamentals of working with 🤗 Datasets. You'll load and prepare a dataset for training with your machine learning framework of choice. Along the way, you'll learn how to load different dataset configurations and splits, interact with and see what's inside your dataset, preprocess, and share a dataset to the [Hub](https://huggingface.co/datasets). 4 | 5 | The tutorials assume some basic knowledge of Python and a machine learning framework like PyTorch or TensorFlow. If you're already familiar with these, feel free to check out the [quickstart](./quickstart) to see what you can do with 🤗 Datasets. 6 | 7 | > [!TIP] 8 | > The tutorials only cover the basic skills you need to use 🤗 Datasets. There are many other useful functionalities and applications that aren't discussed here. If you're interested in learning more, take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course. 9 | 10 | If you have any questions about 🤗 Datasets, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/datasets/10). 11 | 12 | Let's get started! 
🏁 13 | -------------------------------------------------------------------------------- /.github/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "datasets" %} 2 | 3 | package: 4 | name: "{{ name|lower }}" 5 | version: "{{ DATASETS_VERSION }}" 6 | 7 | source: 8 | path: ../../ 9 | 10 | build: 11 | noarch: python 12 | 13 | requirements: 14 | host: 15 | - python 16 | - pip 17 | - numpy >=1.17 18 | - pyarrow >=16.0.0 19 | - python-xxhash 20 | - dill 21 | - pandas 22 | - requests >=2.19.0 23 | - httpx <1.0.0 24 | - tqdm >=4.66.3 25 | - dataclasses 26 | - multiprocess 27 | - fsspec 28 | - huggingface_hub >=0.25.0,<2.0.0 29 | - packaging 30 | run: 31 | - python 32 | - pip 33 | - numpy >=1.17 34 | - pyarrow >=16.0.0 35 | - python-xxhash 36 | - dill 37 | - pandas 38 | - requests >=2.19.0 39 | - httpx <1.0.0 40 | - tqdm >=4.66.3 41 | - dataclasses 42 | - multiprocess 43 | - fsspec 44 | - huggingface_hub >=0.25.0,<2.0.0 45 | - packaging 46 | 47 | test: 48 | imports: 49 | - datasets 50 | 51 | about: 52 | home: https://huggingface.co 53 | license: Apache License 2.0 54 | license_file: LICENSE 55 | summary: "🤗 The largest hub of ready-to-use NLP datasets for ML models with fast, easy-to-use and efficient data manipulation tools" 56 | -------------------------------------------------------------------------------- /.github/workflows/release-conda.yml: -------------------------------------------------------------------------------- 1 | name: Release - Conda 2 | 3 | on: 4 | push: 5 | tags: 6 | - "[0-9]+.[0-9]+.[0-9]+*" 7 | 8 | env: 9 | ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} 10 | 11 | jobs: 12 | build_and_package: 13 | runs-on: ubuntu-22.04 14 | defaults: 15 | run: 16 | shell: bash -l {0} 17 | 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v4 21 | 22 | - name: Install miniconda 23 | uses: conda-incubator/setup-miniconda@v2 24 | with: 25 | auto-update-conda: true 26 | auto-activate-base: false 27 | activate-environment: "build-datasets" 28 | python-version: 3.9 29 | channels: huggingface 30 | 31 | - name: Setup conda env 32 | run: | 33 | conda install -c defaults anaconda-client conda-build 34 | 35 | - name: Extract version 36 | run: echo "DATASETS_VERSION=`python setup.py --version`" >> $GITHUB_ENV 37 | 38 | - name: Build conda packages 39 | run: | 40 | conda info 41 | conda build .github/conda 42 | 43 | - name: Upload to Anaconda 44 | run: | 45 | anaconda upload `conda build .github/conda --output -c conda-forge` --force 46 | -------------------------------------------------------------------------------- /src/datasets/utils/experimental.py: -------------------------------------------------------------------------------- 1 | """Contains utilities to flag a feature as "experimental" in datasets.""" 2 | 3 | import warnings 4 | from functools import wraps 5 | from typing import Callable 6 | 7 | 8 | def experimental(fn: Callable) -> Callable: 9 | """Decorator to flag a feature as experimental. 10 | 11 | An experimental feature trigger a warning when used as it might be subject to breaking changes in the future. 12 | 13 | Args: 14 | fn (`Callable`): 15 | The function to flag as experimental. 16 | 17 | Returns: 18 | `Callable`: The decorated function. 19 | 20 | Example: 21 | 22 | ```python 23 | >>> from datasets.utils import experimental 24 | 25 | >>> @experimental 26 | ... def my_function(): 27 | ... 
print("Hello world!") 28 | 29 | >>> my_function() 30 | UserWarning: 'my_function' is experimental and might be subject to breaking changes in the future. 31 | Hello world! 32 | ``` 33 | """ 34 | 35 | @wraps(fn) 36 | def _inner_fn(*args, **kwargs): 37 | warnings.warn( 38 | (f"'{fn.__name__}' is experimental and might be subject to breaking changes in the future."), 39 | UserWarning, 40 | ) 41 | return fn(*args, **kwargs) 42 | 43 | return _inner_fn 44 | -------------------------------------------------------------------------------- /src/datasets/commands/datasets_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from datasets.commands.delete_from_hub import DeleteFromHubCommand 5 | from datasets.commands.env import EnvironmentCommand 6 | from datasets.commands.test import TestCommand 7 | from datasets.utils.logging import set_verbosity_info 8 | 9 | 10 | def parse_unknown_args(unknown_args): 11 | return {key.lstrip("-"): value for key, value in zip(unknown_args[::2], unknown_args[1::2])} 12 | 13 | 14 | def main(): 15 | parser = ArgumentParser( 16 | "HuggingFace Datasets CLI tool", usage="datasets-cli []", allow_abbrev=False 17 | ) 18 | commands_parser = parser.add_subparsers(help="datasets-cli command helpers") 19 | set_verbosity_info() 20 | 21 | # Register commands 22 | EnvironmentCommand.register_subcommand(commands_parser) 23 | TestCommand.register_subcommand(commands_parser) 24 | DeleteFromHubCommand.register_subcommand(commands_parser) 25 | 26 | # Parse args 27 | args, unknown_args = parser.parse_known_args() 28 | if not hasattr(args, "func"): 29 | parser.print_help() 30 | exit(1) 31 | kwargs = parse_unknown_args(unknown_args) 32 | 33 | # Run 34 | service = args.func(args, **kwargs) 35 | service.run() 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /src/datasets/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | import fsspec 5 | import huggingface_hub 6 | import pandas 7 | import pyarrow 8 | 9 | from datasets import __version__ as version 10 | from datasets.commands import BaseDatasetsCLICommand 11 | 12 | 13 | def info_command_factory(_): 14 | return EnvironmentCommand() 15 | 16 | 17 | class EnvironmentCommand(BaseDatasetsCLICommand): 18 | @staticmethod 19 | def register_subcommand(parser: ArgumentParser): 20 | download_parser = parser.add_parser("env", help="Print relevant system environment info.") 21 | download_parser.set_defaults(func=info_command_factory) 22 | 23 | def run(self): 24 | info = { 25 | "`datasets` version": version, 26 | "Platform": platform.platform(), 27 | "Python version": platform.python_version(), 28 | "`huggingface_hub` version": huggingface_hub.__version__, 29 | "PyArrow version": pyarrow.__version__, 30 | "Pandas version": pandas.__version__, 31 | "`fsspec` version": fsspec.__version__, 32 | } 33 | 34 | print("\nCopy-and-paste the text below in your GitHub issue.\n") 35 | print(self.format_dict(info)) 36 | 37 | return info 38 | 39 | @staticmethod 40 | def format_dict(d): 41 | return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n" 42 | -------------------------------------------------------------------------------- /benchmarks/results/benchmark_array_xd.json: 
-------------------------------------------------------------------------------- 1 | {"write_array2d": 0.14168284999323077, "read_unformated after write_array2d": 0.04353281999647152, "read_formatted_as_numpy after write_array2d": 0.1285462469968479, "read_batch_unformated after write_array2d": 0.023109222995117307, "read_batch_formatted_as_numpy after write_array2d": 0.011352884990628809, "read_col_unformated after write_array2d": 0.037052362007671036, "read_col_formatted_as_numpy after write_array2d": 0.007985618998645805, "write_nested_sequence": 1.4927163410029607, "read_unformated after write_nested_sequence": 0.28319963401008863, "read_formatted_as_numpy after write_nested_sequence": 0.419271487990045, "read_batch_unformated after write_nested_sequence": 0.3234798710036557, "read_batch_formatted_as_numpy after write_nested_sequence": 0.03850809299910907, "read_col_unformated after write_nested_sequence": 0.29384092400141526, "read_col_formatted_as_numpy after write_nested_sequence": 0.004250421989127062, "write_flattened_sequence": 1.4521546780015342, "read_unformated after write_flattened_sequence": 0.25513897799828555, "read_formatted_as_numpy after write_flattened_sequence": 0.07564631900459062, "read_batch_unformated after write_flattened_sequence": 0.2758980469952803, "read_batch_formatted_as_numpy after write_flattened_sequence": 0.011008214991306886, "read_col_unformated after write_flattened_sequence": 0.25848906899045687, "read_col_formatted_as_numpy after write_flattened_sequence": 0.004328447001171298} -------------------------------------------------------------------------------- /src/datasets/packaged_modules/generator/generator.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Callable, Optional 3 | 4 | import datasets 5 | from datasets.builder import Key 6 | from datasets.utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs 7 | 8 | 9 | @dataclass 10 | class GeneratorConfig(datasets.BuilderConfig): 11 | generator: Optional[Callable] = None 12 | gen_kwargs: Optional[dict] = None 13 | features: Optional[datasets.Features] = None 14 | split: datasets.NamedSplit = datasets.Split.TRAIN 15 | 16 | def __post_init__(self): 17 | super().__post_init__() 18 | if self.generator is None: 19 | raise ValueError("generator must be specified") 20 | 21 | if self.gen_kwargs is None: 22 | self.gen_kwargs = {} 23 | 24 | 25 | class Generator(datasets.GeneratorBasedBuilder): 26 | BUILDER_CONFIG_CLASS = GeneratorConfig 27 | 28 | def _info(self): 29 | return datasets.DatasetInfo(features=self.config.features) 30 | 31 | def _split_generators(self, dl_manager): 32 | return [datasets.SplitGenerator(name=self.config.split, gen_kwargs=self.config.gen_kwargs)] 33 | 34 | def _generate_examples(self, **gen_kwargs): 35 | num_shards = _number_of_shards_in_gen_kwargs(gen_kwargs) 36 | for shard_idx, shard_gen_kwargs in enumerate(_split_gen_kwargs(gen_kwargs, max_num_jobs=num_shards)): 37 | for sample_idx, sample in enumerate(self.config.generator(**shard_gen_kwargs)): 38 | yield Key(shard_idx, sample_idx), sample 39 | -------------------------------------------------------------------------------- /src/datasets/commands/delete_from_hub.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from typing import Optional 3 | 4 | from datasets.commands import BaseDatasetsCLICommand 5 | from datasets.hub import 
delete_from_hub 6 | 7 | 8 | def _command_factory(args): 9 | return DeleteFromHubCommand( 10 | args.dataset_id, 11 | args.config_name, 12 | args.token, 13 | args.revision, 14 | ) 15 | 16 | 17 | class DeleteFromHubCommand(BaseDatasetsCLICommand): 18 | @staticmethod 19 | def register_subcommand(parser): 20 | parser: ArgumentParser = parser.add_parser("delete_from_hub", help="Delete dataset config from the Hub") 21 | parser.add_argument( 22 | "dataset_id", help="source dataset ID, e.g. USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME" 23 | ) 24 | parser.add_argument("config_name", help="config name to delete") 25 | parser.add_argument("--token", help="access token to the Hugging Face Hub") 26 | parser.add_argument("--revision", help="source revision") 27 | parser.set_defaults(func=_command_factory) 28 | 29 | def __init__( 30 | self, 31 | dataset_id: str, 32 | config_name: str, 33 | token: Optional[str], 34 | revision: Optional[str], 35 | ): 36 | self._dataset_id = dataset_id 37 | self._config_name = config_name 38 | self._token = token 39 | self._revision = revision 40 | 41 | def run(self) -> None: 42 | _ = delete_from_hub(self._dataset_id, self._config_name, revision=self._revision, token=self._token) 43 | -------------------------------------------------------------------------------- /docs/source/cli.mdx: -------------------------------------------------------------------------------- 1 | # Command Line Interface (CLI) 2 | 3 | 🤗 Datasets provides a command line interface (CLI) with useful shell commands to interact with your dataset. 4 | 5 | You can check the available commands: 6 | ```bash 7 | >>> datasets-cli --help 8 | usage: datasets-cli [] 9 | 10 | positional arguments: 11 | {env,test,delete_from_hub} 12 | datasets-cli command helpers 13 | env Print relevant system environment info. 14 | test Test dataset loading. 15 | delete_from_hub Delete dataset config from the Hub 16 | 17 | optional arguments: 18 | -h, --help show this help message and exit 19 | ``` 20 | 21 | ## Delete from Hub 22 | 23 | Delete a dataset configuration from a [supported dataset](repository_structure) on the Hub. 24 | 25 | ```bash 26 | >>> datasets-cli delete_from_hub --help 27 | usage: datasets-cli [] delete_from_hub [-h] [--token TOKEN] [--revision REVISION] dataset_id config_name 28 | 29 | positional arguments: 30 | dataset_id source dataset ID, e.g. 
USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME 31 | config_name config name to delete 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | --token TOKEN access token to the Hugging Face Hub 36 | --revision REVISION source revision 37 | ``` 38 | 39 | For example: 40 | ```bash 41 | >>> datasets-cli delete_from_hub USERNAME/DATASET_NAME CONFIG_NAME 42 | ``` 43 | 44 | > [!TIP] 45 | > Do not forget that you need to log in first to your Hugging Face account: 46 | > ```bash 47 | > >>> hf auth login 48 | > ``` 49 | -------------------------------------------------------------------------------- /src/datasets/distributed.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | from .arrow_dataset import Dataset, _split_by_node_map_style_dataset 4 | from .iterable_dataset import IterableDataset, _split_by_node_iterable_dataset 5 | 6 | 7 | DatasetType = TypeVar("DatasetType", Dataset, IterableDataset) 8 | 9 | 10 | def split_dataset_by_node(dataset: DatasetType, rank: int, world_size: int) -> DatasetType: 11 | """ 12 | Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`. 13 | 14 | For map-style datasets: 15 | 16 | Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset. 17 | To maximize data loading throughput, chunks are made of contiguous data on disk if possible. 18 | 19 | For iterable datasets: 20 | 21 | If the dataset has a number of shards that is a factor of `world_size` (i.e. if `dataset.num_shards % world_size == 0`), 22 | then the shards are evenly assigned across the nodes, which is the most optimized. 23 | Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples. 24 | 25 | Args: 26 | dataset ([`Dataset`] or [`IterableDataset`]): 27 | The dataset to split by node. 28 | rank (`int`): 29 | Rank of the current node. 30 | world_size (`int`): 31 | Total number of nodes. 32 | 33 | Returns: 34 | [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`. 35 | """ 36 | if isinstance(dataset, Dataset): 37 | return _split_by_node_map_style_dataset(dataset, rank=rank, world_size=world_size) 38 | else: 39 | return _split_by_node_iterable_dataset(dataset, rank=rank, world_size=world_size) 40 | -------------------------------------------------------------------------------- /src/datasets/filesystems/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import shutil 3 | import warnings 4 | from typing import List 5 | 6 | import fsspec 7 | import fsspec.asyn 8 | from fsspec.implementations.local import LocalFileSystem 9 | 10 | from . import compression 11 | 12 | 13 | COMPRESSION_FILESYSTEMS: list[compression.BaseCompressedFileFileSystem] = [ 14 | compression.Bz2FileSystem, 15 | compression.GzipFileSystem, 16 | compression.Lz4FileSystem, 17 | compression.XzFileSystem, 18 | compression.ZstdFileSystem, 19 | ] 20 | 21 | # Register custom filesystems 22 | for fs_class in COMPRESSION_FILESYSTEMS: 23 | if fs_class.protocol in fsspec.registry and fsspec.registry[fs_class.protocol] is not fs_class: 24 | warnings.warn(f"A filesystem protocol was already set for {fs_class.protocol} and will be overwritten.") 25 | fsspec.register_implementation(fs_class.protocol, fs_class, clobber=True) 26 | 27 | 28 | def is_remote_filesystem(fs: fsspec.AbstractFileSystem) -> bool: 29 | """ 30 | Checks if `fs` is a remote filesystem. 
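A filesystem is considered remote as soon as it is not an instance of `fsspec`'s `LocalFileSystem`.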
31 | 32 | Args: 33 | fs (`fsspec.spec.AbstractFileSystem`): 34 | An abstract super-class for pythonic file-systems, e.g. `fsspec.filesystem(\'file\')` or `s3fs.S3FileSystem`. 35 | """ 36 | return not isinstance(fs, LocalFileSystem) 37 | 38 | 39 | def rename(fs: fsspec.AbstractFileSystem, src: str, dst: str): 40 | """ 41 | Renames the file `src` in `fs` to `dst`. 42 | """ 43 | if not is_remote_filesystem(fs): 44 | # LocalFileSystem.mv does copy + rm, it is more efficient to simply move a local directory 45 | shutil.move(fs._strip_protocol(src), fs._strip_protocol(dst)) 46 | else: 47 | fs.mv(src, dst, recursive=True) 48 | -------------------------------------------------------------------------------- /docs/source/how_to.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | The how-to guides offer a more comprehensive overview of all the tools 🤗 Datasets offers and how to use them. This will help you tackle messier real-world datasets where you may need to manipulate the dataset structure or content to get it ready for training. 4 | 5 | The guides assume you are familiar and comfortable with the 🤗 Datasets basics. We recommend newer users check out our [tutorials](tutorial) first. 6 | 7 | > [!TIP] 8 | > Interested in learning more? Take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course! 9 | 10 | The guides are organized into six sections: 11 | 12 | - General usage: Functions for general dataset loading and processing. The functions shown in this section are applicable across all dataset modalities. 13 | - Audio: How to load, process, and share audio datasets. 14 | - Vision: How to load, process, and share image and video datasets. 15 | - Text: How to load, process, and share text datasets. 16 | - Tabular: How to load, process, and share tabular datasets. 17 | - Dataset repository: How to share and upload a dataset to the Hub. 18 | 19 | If you have any questions about 🤗 Datasets, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/datasets/10). 20 | -------------------------------------------------------------------------------- /benchmarks/format.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | 5 | def format_json_to_md(input_json_file, output_md_file): 6 | with open(input_json_file, encoding="utf-8") as f: 7 | results = json.load(f) 8 | 9 | output_md = ["
", "Show updated benchmarks!", " "] 10 | 11 | for benchmark_name in sorted(results): 12 | benchmark_res = results[benchmark_name] 13 | 14 | benchmark_file_name = benchmark_name.split("/")[-1] 15 | output_md.append(f"### Benchmark: {benchmark_file_name}") 16 | 17 | title = "| metric |" 18 | lines = "|--------|" 19 | value = "| new / old (diff) |" 20 | for metric_name in sorted(benchmark_res): 21 | metric_vals = benchmark_res[metric_name] 22 | new_val = metric_vals["new"] 23 | old_val = metric_vals.get("old", None) 24 | dif_val = metric_vals.get("diff", None) 25 | 26 | val_str = f" {new_val:f}" if isinstance(new_val, (int, float)) else "None" 27 | 28 | if old_val is not None: 29 | val_str += f" / {old_val:f}" if isinstance(old_val, (int, float)) else "None" 30 | if dif_val is not None: 31 | val_str += f" ({dif_val:f})" if isinstance(dif_val, (int, float)) else "None" 32 | 33 | title += " " + metric_name + " |" 34 | lines += "---|" 35 | value += val_str + " |" 36 | 37 | output_md += [title, lines, value, " "] 38 | 39 | output_md.append("
") 40 | 41 | with open(output_md_file, "w", encoding="utf-8") as f: 42 | f.writelines("\n".join(output_md)) 43 | 44 | 45 | if __name__ == "__main__": 46 | input_json_file = sys.argv[1] 47 | output_md_file = sys.argv[2] 48 | 49 | format_json_to_md(input_json_file, output_md_file) 50 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Create a report to help reproduce and fix the bug 3 | body: 4 | - type: textarea 5 | id: description 6 | attributes: 7 | label: Describe the bug 8 | description: A clear and concise description of what the bug is 9 | validations: 10 | required: true 11 | 12 | - type: textarea 13 | id: reproduction 14 | attributes: 15 | label: Steps to reproduce the bug 16 | description: | 17 | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. 18 | If you have code snippets, error messages, stack traces please provide them here as well. 19 | Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 20 | Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. 21 | placeholder: | 22 | Steps to reproduce the behavior: 23 | 24 | 1. 25 | 2. 26 | 3. 27 | validations: 28 | required: true 29 | 30 | - type: textarea 31 | id: expected-behavior 32 | validations: 33 | required: true 34 | attributes: 35 | label: Expected behavior 36 | description: A clear and concise description of the expected results. 37 | 38 | - type: textarea 39 | id: environment-info 40 | attributes: 41 | label: Environment info 42 | description: Please share your environemnt info with us. You can run the command `datasets-cli env` and copy-paste its output below. 43 | placeholder: datasets version, platform, python version, ... 44 | validations: 45 | required: true 46 | -------------------------------------------------------------------------------- /src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | __version__ = "4.4.2.dev0" 16 | 17 | from .arrow_dataset import Column, Dataset 18 | from .arrow_reader import ReadInstruction 19 | from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder 20 | from .combine import concatenate_datasets, interleave_datasets 21 | from .dataset_dict import DatasetDict, IterableDatasetDict 22 | from .download import * 23 | from .features import * 24 | from .fingerprint import disable_caching, enable_caching, is_caching_enabled 25 | from .info import DatasetInfo 26 | from .inspect import ( 27 | get_dataset_config_info, 28 | get_dataset_config_names, 29 | get_dataset_default_config_name, 30 | get_dataset_infos, 31 | get_dataset_split_names, 32 | ) 33 | from .iterable_dataset import IterableColumn, IterableDataset 34 | from .load import load_dataset, load_dataset_builder, load_from_disk 35 | from .splits import ( 36 | NamedSplit, 37 | NamedSplitAll, 38 | Split, 39 | SplitBase, 40 | SplitDict, 41 | SplitGenerator, 42 | SplitInfo, 43 | SubSplitInfo, 44 | percent, 45 | ) 46 | from .utils import * 47 | from .utils import logging 48 | -------------------------------------------------------------------------------- /tests/distributed_scripts/run_torch_distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | from typing import List 4 | 5 | import torch.utils.data 6 | 7 | from datasets import Dataset, IterableDataset 8 | from datasets.distributed import split_dataset_by_node 9 | 10 | 11 | NUM_SHARDS = 4 12 | NUM_ITEMS_PER_SHARD = 3 13 | 14 | 15 | class FailedTestError(RuntimeError): 16 | pass 17 | 18 | 19 | def gen(shards: List[str]): 20 | for shard in shards: 21 | for i in range(NUM_ITEMS_PER_SHARD): 22 | yield {"i": i, "shard": shard} 23 | 24 | 25 | def main(): 26 | rank = int(os.environ["RANK"]) 27 | world_size = int(os.environ["WORLD_SIZE"]) 28 | 29 | parser = ArgumentParser() 30 | parser.add_argument("--streaming", type=bool) 31 | parser.add_argument("--local_rank", type=int) 32 | parser.add_argument("--num_workers", type=int, default=0) 33 | args = parser.parse_args() 34 | streaming = args.streaming 35 | num_workers = args.num_workers 36 | 37 | gen_kwargs = {"shards": [f"shard_{shard_idx}" for shard_idx in range(NUM_SHARDS)]} 38 | ds = IterableDataset.from_generator(gen, gen_kwargs=gen_kwargs) 39 | if not streaming: 40 | ds = Dataset.from_list(list(ds)) 41 | 42 | ds = split_dataset_by_node(ds, rank=rank, world_size=world_size) 43 | dataloader = torch.utils.data.DataLoader(ds, num_workers=num_workers) 44 | 45 | full_size = NUM_SHARDS * NUM_ITEMS_PER_SHARD 46 | expected_local_size = full_size // world_size 47 | expected_local_size += int(rank < (full_size % world_size)) 48 | 49 | local_size = sum(1 for _ in dataloader) 50 | if local_size != expected_local_size: 51 | raise FailedTestError(f"local_size {local_size} != expected_local_size {expected_local_size}") 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /src/datasets/io/abc.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional, Union 3 | 4 | from .. 
import Dataset, DatasetDict, Features, IterableDataset, IterableDatasetDict, NamedSplit 5 | from ..utils.typing import NestedDataStructureLike, PathLike 6 | 7 | 8 | class AbstractDatasetReader(ABC): 9 | def __init__( 10 | self, 11 | path_or_paths: Optional[NestedDataStructureLike[PathLike]] = None, 12 | split: Optional[NamedSplit] = None, 13 | features: Optional[Features] = None, 14 | cache_dir: str = None, 15 | keep_in_memory: bool = False, 16 | streaming: bool = False, 17 | num_proc: Optional[int] = None, 18 | **kwargs, 19 | ): 20 | self.path_or_paths = path_or_paths 21 | self.split = split if split or isinstance(path_or_paths, dict) else "train" 22 | self.features = features 23 | self.cache_dir = cache_dir 24 | self.keep_in_memory = keep_in_memory 25 | self.streaming = streaming 26 | self.num_proc = num_proc 27 | self.kwargs = kwargs 28 | 29 | @abstractmethod 30 | def read(self) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: 31 | pass 32 | 33 | 34 | class AbstractDatasetInputStream(ABC): 35 | def __init__( 36 | self, 37 | features: Optional[Features] = None, 38 | cache_dir: str = None, 39 | keep_in_memory: bool = False, 40 | streaming: bool = False, 41 | num_proc: Optional[int] = None, 42 | **kwargs, 43 | ): 44 | self.features = features 45 | self.cache_dir = cache_dir 46 | self.keep_in_memory = keep_in_memory 47 | self.streaming = streaming 48 | self.num_proc = num_proc 49 | self.kwargs = kwargs 50 | 51 | @abstractmethod 52 | def read(self) -> Union[Dataset, IterableDataset]: 53 | pass 54 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # 🤗 Datasets Notebooks 18 | 19 | You can find here a list of the official notebooks provided by Hugging Face. 20 | 21 | Also, we would like to list here interesting content created by the community. 22 | If you wrote some notebook(s) leveraging 🤗 Datasets and would like it to be listed here, please open a 23 | Pull Request so it can be included under the Community notebooks. 
24 | 25 | ## Hugging Face's notebooks 🤗 26 | 27 | ### Documentation notebooks 28 | 29 | You can open any page of the documentation as a notebook in Colab (there is a button directly on said pages) but they are also listed here if you need them: 30 | 31 | | Notebook | Description | | | 32 | |:----------|:-------------|:-------------|------:| 33 | | [Quickstart](https://github.com/huggingface/notebooks/blob/main/datasets_doc/en/quickstart.ipynb) | A quick presentation on integrating Datasets into a model training workflow |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/datasets_doc/en/quickstart.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/datasets_doc/en/quickstart.ipynb)| 34 | -------------------------------------------------------------------------------- /benchmarks/benchmark_indices_mapping.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | 5 | import datasets 6 | from utils import generate_example_dataset, get_duration 7 | 8 | 9 | SPEED_TEST_N_EXAMPLES = 500_000 10 | 11 | RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__) 12 | RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json")) 13 | 14 | 15 | @get_duration 16 | def select(dataset: datasets.Dataset): 17 | _ = dataset.select(range(0, len(dataset), 2)) 18 | 19 | 20 | @get_duration 21 | def sort(dataset: datasets.Dataset): 22 | _ = dataset.sort("numbers") 23 | 24 | 25 | @get_duration 26 | def shuffle(dataset: datasets.Dataset): 27 | _ = dataset.shuffle() 28 | 29 | 30 | @get_duration 31 | def train_test_split(dataset: datasets.Dataset): 32 | _ = dataset.train_test_split(0.1) 33 | 34 | 35 | @get_duration 36 | def shard(dataset: datasets.Dataset, num_shards=10): 37 | for shard_id in range(num_shards): 38 | _ = dataset.shard(num_shards, shard_id) 39 | 40 | 41 | def benchmark_indices_mapping(): 42 | times = {"num examples": SPEED_TEST_N_EXAMPLES} 43 | functions = (select, sort, shuffle, train_test_split, shard) 44 | with tempfile.TemporaryDirectory() as tmp_dir: 45 | print("generating dataset") 46 | features = datasets.Features({"text": datasets.Value("string"), "numbers": datasets.Value("float32")}) 47 | dataset = generate_example_dataset( 48 | os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES 49 | ) 50 | print("Functions") 51 | for func in functions: 52 | print(func.__name__) 53 | times[func.__name__] = func(dataset) 54 | 55 | with open(RESULTS_FILE_PATH, "wb") as f: 56 | f.write(json.dumps(times).encode("utf-8")) 57 | 58 | 59 | if __name__ == "__main__": # useful to run the profiler 60 | benchmark_indices_mapping() 61 | -------------------------------------------------------------------------------- /tests/test_splits.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | import pytest 4 | 5 | from datasets.splits import Split, SplitDict, SplitInfo 6 | from datasets.utils.py_utils import asdict 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "split_dict", 11 | [ 12 | SplitDict(), 13 | SplitDict({"train": SplitInfo(name="train", num_bytes=1337, num_examples=42, dataset_name="my_dataset")}), 14 | SplitDict({"train": SplitInfo(name="train", num_bytes=1337, num_examples=42)}), 15 | SplitDict({"train": 
SplitInfo()}), 16 | ], 17 | ) 18 | def test_split_dict_to_yaml_list(split_dict: SplitDict): 19 | split_dict_yaml_list = split_dict._to_yaml_list() 20 | assert len(split_dict_yaml_list) == len(split_dict) 21 | reloaded = SplitDict._from_yaml_list(split_dict_yaml_list) 22 | for split_name, split_info in split_dict.items(): 23 | # dataset_name field is deprecated, and is therefore not part of the YAML dump 24 | split_info.dataset_name = None 25 | # the split name of split_dict takes over the name of the split info object 26 | split_info.name = split_name 27 | assert split_dict == reloaded 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "split_info", [SplitInfo(), SplitInfo(dataset_name=None), SplitInfo(dataset_name="my_dataset")] 32 | ) 33 | def test_split_dict_asdict_has_dataset_name(split_info): 34 | # For backward compatibility, we need asdict(split_dict) to return split info dictrionaries with the "dataset_name" 35 | # field even if it's deprecated. This way old versionso of `datasets` can still reload dataset_infos.json files 36 | split_dict_asdict = asdict(SplitDict({"train": split_info})) 37 | assert "dataset_name" in split_dict_asdict["train"] 38 | assert split_dict_asdict["train"]["dataset_name"] == split_info.dataset_name 39 | 40 | 41 | def test_named_split_inequality(): 42 | # Used while building the docs, when set as a default parameter value in a function signature 43 | assert Split.TRAIN != inspect.Parameter.empty 44 | -------------------------------------------------------------------------------- /tests/test_parallel.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.parallel import ParallelBackendConfig, parallel_backend 4 | from datasets.utils.py_utils import map_nested 5 | 6 | from .utils import require_dill_gt_0_3_2, require_joblibspark, require_not_windows 7 | 8 | 9 | def add_one(i): # picklable for multiprocessing 10 | return i + 1 11 | 12 | 13 | @require_dill_gt_0_3_2 14 | @require_joblibspark 15 | @require_not_windows 16 | def test_parallel_backend_input(): 17 | with parallel_backend("spark"): 18 | assert ParallelBackendConfig.backend_name == "spark" 19 | 20 | lst = [1, 2, 3] 21 | with pytest.raises(ValueError): 22 | with parallel_backend("unsupported backend"): 23 | map_nested(add_one, lst, num_proc=2) 24 | 25 | with pytest.raises(ValueError): 26 | with parallel_backend("unsupported backend"): 27 | map_nested(add_one, lst, num_proc=-1) 28 | 29 | 30 | @require_dill_gt_0_3_2 31 | @require_joblibspark 32 | @require_not_windows 33 | @pytest.mark.parametrize("num_proc", [2, -1]) 34 | def test_parallel_backend_map_nested(num_proc): 35 | s1 = [1, 2] 36 | s2 = {"a": 1, "b": 2} 37 | s3 = {"a": [1, 2], "b": [3, 4]} 38 | s4 = {"a": {"1": 1}, "b": 2} 39 | s5 = {"a": 1, "b": 2, "c": 3, "d": 4} 40 | expected_map_nested_s1 = [2, 3] 41 | expected_map_nested_s2 = {"a": 2, "b": 3} 42 | expected_map_nested_s3 = {"a": [2, 3], "b": [4, 5]} 43 | expected_map_nested_s4 = {"a": {"1": 2}, "b": 3} 44 | expected_map_nested_s5 = {"a": 2, "b": 3, "c": 4, "d": 5} 45 | 46 | with parallel_backend("spark"): 47 | assert map_nested(add_one, s1, num_proc=num_proc) == expected_map_nested_s1 48 | assert map_nested(add_one, s2, num_proc=num_proc) == expected_map_nested_s2 49 | assert map_nested(add_one, s3, num_proc=num_proc) == expected_map_nested_s3 50 | assert map_nested(add_one, s4, num_proc=num_proc) == expected_map_nested_s4 51 | assert map_nested(add_one, s5, num_proc=num_proc) == expected_map_nested_s5 52 | 
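# The tests above exercise the public pattern end to end; as a rough standalone sketch (assuming
# pyspark and joblibspark are installed), the same API can be used outside pytest like this:
#
#     from datasets.parallel import parallel_backend
#     from datasets.utils.py_utils import map_nested
#
#     def add_one(i):
#         return i + 1
#
#     with parallel_backend("spark"):
#         # num_proc > 1 makes map_nested dispatch its jobs through joblib's Spark backend
#         print(map_nested(add_one, {"a": [1, 2], "b": [3, 4]}, num_proc=2))  # {'a': [2, 3], 'b': [4, 5]}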
-------------------------------------------------------------------------------- /docs/source/nlp_load.mdx: -------------------------------------------------------------------------------- 1 | # Load text data 2 | 3 | This guide shows you how to load text datasets. To learn how to load any type of dataset, take a look at the general loading guide. 4 | 5 | Text files are one of the most common file types for storing a dataset. By default, 🤗 Datasets samples a text file line by line to build the dataset. 6 | 7 | ```py 8 | >>> from datasets import load_dataset 9 | >>> dataset = load_dataset("text", data_files={"train": ["my_text_1.txt", "my_text_2.txt"], "test": "my_test_file.txt"}) 10 | 11 | # Load from a directory 12 | >>> dataset = load_dataset("text", data_dir="path/to/text/dataset") 13 | ``` 14 | 15 | To sample a text file by paragraph or even an entire document, use the `sample_by` parameter: 16 | 17 | ```py 18 | # Sample by paragraph 19 | >>> dataset = load_dataset("text", data_files={"train": "my_train_file.txt", "test": "my_test_file.txt"}, sample_by="paragraph") 20 | 21 | # Sample by document 22 | >>> dataset = load_dataset("text", data_files={"train": "my_train_file.txt", "test": "my_test_file.txt"}, sample_by="document") 23 | ``` 24 | 25 | You can also use grep patterns to load specific files: 26 | 27 | ```py 28 | >>> from datasets import load_dataset 29 | >>> c4_subset = load_dataset("allenai/c4", data_files="en/c4-train.0000*-of-01024.json.gz") 30 | ``` 31 | 32 | To load remote text files via HTTP, pass the URLs instead: 33 | 34 | ```py 35 | >>> dataset = load_dataset("text", data_files="https://huggingface.co/datasets/hf-internal-testing/dataset_with_data_files/resolve/main/data/train.txt") 36 | ``` 37 | 38 | To load XML data you can use the "xml" loader, which is equivalent to "text" with sample_by="document": 39 | 40 | ```py 41 | >>> from datasets import load_dataset 42 | >>> dataset = load_dataset("xml", data_files={"train": ["my_xml_1.xml", "my_xml_2.xml"], "test": "my_xml_file.xml"}) 43 | 44 | # Load from a directory 45 | >>> dataset = load_dataset("xml", data_dir="path/to/xml/dataset") 46 | ``` 47 | -------------------------------------------------------------------------------- /src/datasets/io/spark.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import pyspark 4 | 5 | from .. import Features, NamedSplit 6 | from ..download import DownloadMode 7 | from ..packaged_modules.spark.spark import Spark 8 | from .abc import AbstractDatasetReader 9 | 10 | 11 | class SparkDatasetReader(AbstractDatasetReader): 12 | """A dataset reader that reads from a Spark DataFrame. 13 | 14 | When caching, cache materialization is parallelized over Spark; an NFS that is accessible to the driver must be 15 | provided. Streaming is not currently supported. 
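Example (a hypothetical usage sketch, not taken from the original docstring; it assumes an active `SparkSession`, a `pyspark.sql.DataFrame` named `df`, and — for the cached path — a cache directory on shared storage as described above):

    ```python
    >>> from datasets.io.spark import SparkDatasetReader
    >>> ds = SparkDatasetReader(df, streaming=False, cache_dir="/mnt/shared_nfs/datasets_cache").read()
    ```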
16 | """ 17 | 18 | def __init__( 19 | self, 20 | df: pyspark.sql.DataFrame, 21 | split: Optional[NamedSplit] = None, 22 | features: Optional[Features] = None, 23 | streaming: bool = True, 24 | cache_dir: str = None, 25 | keep_in_memory: bool = False, 26 | working_dir: str = None, 27 | load_from_cache_file: bool = True, 28 | file_format: str = "arrow", 29 | **kwargs, 30 | ): 31 | super().__init__( 32 | split=split, 33 | features=features, 34 | cache_dir=cache_dir, 35 | keep_in_memory=keep_in_memory, 36 | streaming=streaming, 37 | **kwargs, 38 | ) 39 | self._load_from_cache_file = load_from_cache_file 40 | self._file_format = file_format 41 | self.builder = Spark( 42 | df=df, 43 | features=features, 44 | cache_dir=cache_dir, 45 | working_dir=working_dir, 46 | **kwargs, 47 | ) 48 | 49 | def read(self): 50 | if self.streaming: 51 | return self.builder.as_streaming_dataset(split=self.split) 52 | download_mode = None if self._load_from_cache_file else DownloadMode.FORCE_REDOWNLOAD 53 | self.builder.download_and_prepare( 54 | download_mode=download_mode, 55 | file_format=self._file_format, 56 | ) 57 | return self.builder.as_dataset(split=self.split) 58 | -------------------------------------------------------------------------------- /tests/test_dataset_list.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from datasets import List, Value 4 | from datasets.arrow_dataset import Dataset 5 | 6 | 7 | class DatasetListTest(TestCase): 8 | def _create_example_records(self): 9 | return [ 10 | {"col_1": 3, "col_2": "a"}, 11 | {"col_1": 2, "col_2": "b"}, 12 | {"col_1": 1, "col_2": "c"}, 13 | {"col_1": 0, "col_2": "d"}, 14 | ] 15 | 16 | def _create_example_dict(self): 17 | data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]} 18 | return Dataset.from_dict(data) 19 | 20 | def test_create(self): 21 | example_records = self._create_example_records() 22 | dset = Dataset.from_list(example_records) 23 | self.assertListEqual(dset.column_names, ["col_1", "col_2"]) 24 | for i, r in enumerate(dset): 25 | self.assertDictEqual(r, example_records[i]) 26 | 27 | def test_list_dict_equivalent(self): 28 | example_records = self._create_example_records() 29 | dset = Dataset.from_list(example_records) 30 | dset_from_dict = Dataset.from_dict({k: [r[k] for r in example_records] for k in example_records[0]}) 31 | self.assertEqual(dset.info, dset_from_dict.info) 32 | 33 | def test_uneven_records(self): # checks what happens with missing columns 34 | uneven_records = [{"col_1": 1}, {"col_2": "x"}] 35 | dset = Dataset.from_list(uneven_records) 36 | self.assertDictEqual(dset[0], {"col_1": 1}) 37 | self.assertDictEqual(dset[1], {"col_1": None}) # NB: first record is used for columns 38 | 39 | def test_variable_list_records(self): # checks if the type can be inferred from the second record 40 | list_records = [{"col_1": []}, {"col_1": [1, 2]}] 41 | dset = Dataset.from_list(list_records) 42 | self.assertEqual(dset.info.features["col_1"], List(Value("int64"))) 43 | 44 | def test_create_empty(self): 45 | dset = Dataset.from_list([]) 46 | self.assertEqual(len(dset), 0) 47 | self.assertListEqual(dset.column_names, []) 48 | -------------------------------------------------------------------------------- /src/datasets/utils/track.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable, Iterator 2 | 3 | 4 | class tracked_str(str): 5 | origins = {} 6 | 7 | def set_origin(self, origin: str): 
8 | if super().__repr__() not in self.origins: 9 | self.origins[super().__repr__()] = origin 10 | 11 | def get_origin(self): 12 | return self.origins.get(super().__repr__(), str(self)) 13 | 14 | def __repr__(self) -> str: 15 | if super().__repr__() not in self.origins or self.origins[super().__repr__()] == self: 16 | return super().__repr__() 17 | else: 18 | return f"{str(self)} (origin={self.origins[super().__repr__()]})" 19 | 20 | 21 | class tracked_list(list): 22 | def __init__(self, *args, **kwargs) -> None: 23 | super().__init__(*args, **kwargs) 24 | self.last_item = None 25 | 26 | def __iter__(self) -> Iterator: 27 | for x in super().__iter__(): 28 | self.last_item = x 29 | yield x 30 | self.last_item = None 31 | 32 | def __repr__(self) -> str: 33 | if self.last_item is None: 34 | return super().__repr__() 35 | else: 36 | return f"{self.__class__.__name__}(current={self.last_item})" 37 | 38 | 39 | class TrackedIterableFromGenerator(Iterable): 40 | """Utility class to create an iterable from a generator function, in order to reset the generator when needed.""" 41 | 42 | def __init__(self, generator, *args): 43 | super().__init__() 44 | self.generator = generator 45 | self.args = args 46 | self.last_item = None 47 | 48 | def __iter__(self): 49 | for x in self.generator(*self.args): 50 | self.last_item = x 51 | yield x 52 | self.last_item = None 53 | 54 | def __repr__(self) -> str: 55 | if self.last_item is None: 56 | return super().__repr__() 57 | else: 58 | return f"{self.__class__.__name__}(current={self.last_item})" 59 | 60 | def __reduce__(self): 61 | return (self.__class__, (self.generator, *self.args)) 62 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/audiofolder/audiofolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class AudioFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """Builder Config for AudioFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | super().__post_init__() 17 | 18 | 19 | class AudioFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Audio 21 | BASE_COLUMN_NAME = "audio" 22 | BUILDER_CONFIG_CLASS = AudioFolderConfig 23 | EXTENSIONS: list[str] # definition at the bottom of the script 24 | 25 | 26 | # Obtained with: 27 | # ``` 28 | # import soundfile as sf 29 | # 30 | # AUDIO_EXTENSIONS = [f".{format.lower()}" for format in sf.available_formats().keys()] 31 | # 32 | # # .opus decoding is supported if libsndfile >= 1.0.31: 33 | # AUDIO_EXTENSIONS.extend([".opus"]) 34 | # ``` 35 | # We intentionally did not run this code on launch because: 36 | # (1) Soundfile was an optional dependency, so importing it in global namespace is not allowed 37 | # (2) To ensure the list of supported extensions is deterministic 38 | # (3) We use TorchCodec now anyways instead of Soundfile 39 | AUDIO_EXTENSIONS = [ 40 | ".aiff", 41 | ".au", 42 | ".avr", 43 | ".caf", 44 | ".flac", 45 | ".htk", 46 | ".svx", 47 | ".mat4", 48 | ".mat5", 49 | ".mpc2k", 50 | ".ogg", 51 | ".paf", 52 | ".pvf", 53 | ".raw", 54 | ".rf64", 55 | ".sd2", 56 | ".sds", 57 | ".ircam", 58 | ".voc", 59 | ".w64", 60 | ".wav", 61 | ".nist", 62 | ".wavex", 63 | ".wve", 64 | ".xi", 65 | ".mp3", 66 | ".opus", 67 | ".3gp", 68 | ".3g2", 69 | ".avi", 70 | 
".asf", 71 | ".flv", 72 | ".mp4", 73 | ".mov", 74 | ".m4v", 75 | ".mkv", 76 | ".mpg", 77 | ".webm", 78 | ".f4v", 79 | ".wmv", 80 | ".wma", 81 | ".ogg", 82 | ".ogm", 83 | ".mxf", 84 | ".nut", 85 | ] 86 | AudioFolder.EXTENSIONS = AUDIO_EXTENSIONS 87 | -------------------------------------------------------------------------------- /tests/test_sharding_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.utils.sharding import _distribute_shards, _number_of_shards_in_gen_kwargs, _split_gen_kwargs 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "kwargs, expected", 8 | [ 9 | ({"num_shards": 0, "max_num_jobs": 1}, []), 10 | ({"num_shards": 10, "max_num_jobs": 1}, [range(10)]), 11 | ({"num_shards": 10, "max_num_jobs": 10}, [range(i, i + 1) for i in range(10)]), 12 | ({"num_shards": 1, "max_num_jobs": 10}, [range(1)]), 13 | ({"num_shards": 10, "max_num_jobs": 3}, [range(0, 4), range(4, 7), range(7, 10)]), 14 | ({"num_shards": 3, "max_num_jobs": 10}, [range(0, 1), range(1, 2), range(2, 3)]), 15 | ], 16 | ) 17 | def test_distribute_shards(kwargs, expected): 18 | out = _distribute_shards(**kwargs) 19 | assert out == expected 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "gen_kwargs, max_num_jobs, expected", 24 | [ 25 | ({"foo": 0}, 10, [{"foo": 0}]), 26 | ({"shards": [0, 1, 2, 3]}, 1, [{"shards": [0, 1, 2, 3]}]), 27 | ({"shards": [0, 1, 2, 3]}, 4, [{"shards": [0]}, {"shards": [1]}, {"shards": [2]}, {"shards": [3]}]), 28 | ({"shards": [0, 1]}, 4, [{"shards": [0]}, {"shards": [1]}]), 29 | ({"shards": [0, 1, 2, 3]}, 2, [{"shards": [0, 1]}, {"shards": [2, 3]}]), 30 | ], 31 | ) 32 | def test_split_gen_kwargs(gen_kwargs, max_num_jobs, expected): 33 | out = _split_gen_kwargs(gen_kwargs, max_num_jobs) 34 | assert out == expected 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "gen_kwargs, expected", 39 | [ 40 | ({"foo": 0}, 1), 41 | ({"shards": [0]}, 1), 42 | ({"shards": [0, 1, 2, 3]}, 4), 43 | ({"shards": [0, 1, 2, 3], "foo": 0}, 4), 44 | ({"shards": [0, 1, 2, 3], "other": (0, 1)}, 4), 45 | ({"shards": [0, 1, 2, 3], "shards2": [0, 1]}, RuntimeError), 46 | ], 47 | ) 48 | def test_number_of_shards_in_gen_kwargs(gen_kwargs, expected): 49 | if expected is RuntimeError: 50 | with pytest.raises(expected): 51 | _number_of_shards_in_gen_kwargs(gen_kwargs) 52 | else: 53 | out = _number_of_shards_in_gen_kwargs(gen_kwargs) 54 | assert out == expected 55 | -------------------------------------------------------------------------------- /src/datasets/io/text.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from .. 
import Features, NamedSplit 4 | from ..packaged_modules.text.text import Text 5 | from ..utils.typing import NestedDataStructureLike, PathLike 6 | from .abc import AbstractDatasetReader 7 | 8 | 9 | class TextDatasetReader(AbstractDatasetReader): 10 | def __init__( 11 | self, 12 | path_or_paths: NestedDataStructureLike[PathLike], 13 | split: Optional[NamedSplit] = None, 14 | features: Optional[Features] = None, 15 | cache_dir: str = None, 16 | keep_in_memory: bool = False, 17 | streaming: bool = False, 18 | num_proc: Optional[int] = None, 19 | **kwargs, 20 | ): 21 | super().__init__( 22 | path_or_paths, 23 | split=split, 24 | features=features, 25 | cache_dir=cache_dir, 26 | keep_in_memory=keep_in_memory, 27 | streaming=streaming, 28 | num_proc=num_proc, 29 | **kwargs, 30 | ) 31 | path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths} 32 | self.builder = Text( 33 | cache_dir=cache_dir, 34 | data_files=path_or_paths, 35 | features=features, 36 | **kwargs, 37 | ) 38 | 39 | def read(self): 40 | # Build iterable dataset 41 | if self.streaming: 42 | dataset = self.builder.as_streaming_dataset(split=self.split) 43 | # Build regular (map-style) dataset 44 | else: 45 | download_config = None 46 | download_mode = None 47 | verification_mode = None 48 | base_path = None 49 | 50 | self.builder.download_and_prepare( 51 | download_config=download_config, 52 | download_mode=download_mode, 53 | verification_mode=verification_mode, 54 | base_path=base_path, 55 | num_proc=self.num_proc, 56 | ) 57 | dataset = self.builder.as_dataset( 58 | split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory 59 | ) 60 | return dataset 61 | -------------------------------------------------------------------------------- /tests/test_offline_util.py: -------------------------------------------------------------------------------- 1 | from tempfile import NamedTemporaryFile 2 | 3 | import httpx 4 | import pytest 5 | import requests 6 | from huggingface_hub import get_session 7 | from huggingface_hub.errors import OfflineModeIsEnabled 8 | 9 | from datasets.utils.file_utils import fsspec_get, fsspec_head 10 | 11 | from .utils import ( 12 | IS_HF_HUB_1_x, 13 | OfflineSimulationMode, 14 | RequestWouldHangIndefinitelyError, 15 | offline, 16 | require_not_windows, 17 | ) 18 | 19 | 20 | @pytest.mark.integration 21 | @require_not_windows # fsspec get keeps a file handle on windows that raises PermissionError 22 | def test_offline_with_timeout(): 23 | expected_exception = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout 24 | with offline(OfflineSimulationMode.CONNECTION_TIMES_OUT): 25 | with pytest.raises(RequestWouldHangIndefinitelyError): 26 | get_session().request("GET", "https://huggingface.co") 27 | 28 | with pytest.raises(expected_exception): 29 | get_session().request("GET", "https://huggingface.co", timeout=1.0) 30 | 31 | with pytest.raises(expected_exception), NamedTemporaryFile() as temp_file: 32 | fsspec_get("hf://dummy", temp_file=temp_file) 33 | 34 | 35 | @pytest.mark.integration 36 | @require_not_windows # fsspec get keeps a file handle on windows that raises PermissionError 37 | def test_offline_with_connection_error(): 38 | expected_exception = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError 39 | with offline(OfflineSimulationMode.CONNECTION_FAILS): 40 | with pytest.raises(expected_exception): 41 | get_session().request("GET", "https://huggingface.co") 42 | 43 | with pytest.raises(expected_exception), 
NamedTemporaryFile() as temp_file: 44 | fsspec_get("hf://dummy", temp_file=temp_file) 45 | 46 | 47 | def test_offline_with_datasets_offline_mode_enabled(): 48 | with offline(OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1): 49 | with pytest.raises(OfflineModeIsEnabled): 50 | fsspec_head("hf://dummy") 51 | with pytest.raises(OfflineModeIsEnabled), NamedTemporaryFile() as temp_file: 52 | fsspec_get("hf://dummy", temp_file=temp_file) 53 | -------------------------------------------------------------------------------- /benchmarks/utils.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import numpy as np 4 | 5 | import datasets 6 | from datasets.arrow_writer import ArrowWriter 7 | from datasets.features.features import _ArrayXD 8 | 9 | 10 | def get_duration(func): 11 | def wrapper(*args, **kwargs): 12 | starttime = timeit.default_timer() 13 | _ = func(*args, **kwargs) 14 | delta = timeit.default_timer() - starttime 15 | return delta 16 | 17 | wrapper.__name__ = func.__name__ 18 | 19 | return wrapper 20 | 21 | 22 | def generate_examples(features: dict, num_examples=100, seq_shapes=None): 23 | dummy_data = [] 24 | seq_shapes = seq_shapes or {} 25 | for i in range(num_examples): 26 | example = {} 27 | for col_id, (k, v) in enumerate(features.items()): 28 | if isinstance(v, _ArrayXD): 29 | data = np.random.rand(*v.shape).astype(v.dtype) 30 | elif isinstance(v, datasets.Value): 31 | if v.dtype == "string": 32 | data = "The small grey turtle was surprisingly fast when challenged." 33 | else: 34 | data = np.random.randint(10, size=1).astype(v.dtype).item() 35 | elif isinstance(v, datasets.Sequence): 36 | while isinstance(v, datasets.Sequence): 37 | v = v.feature 38 | shape = seq_shapes[k] 39 | data = np.random.rand(*shape).astype(v.dtype) 40 | example[k] = data 41 | 42 | dummy_data.append((i, example)) 43 | 44 | return dummy_data 45 | 46 | 47 | def generate_example_dataset(dataset_path, features, num_examples=100, seq_shapes=None): 48 | dummy_data = generate_examples(features, num_examples=num_examples, seq_shapes=seq_shapes) 49 | 50 | with ArrowWriter(features=features, path=dataset_path) as writer: 51 | for key, record in dummy_data: 52 | example = features.encode_example(record) 53 | writer.write(example) 54 | 55 | num_final_examples, num_bytes = writer.finalize() 56 | 57 | if not num_final_examples == num_examples: 58 | raise ValueError( 59 | f"Error writing the dataset, wrote {num_final_examples} examples but should have written {num_examples}." 60 | ) 61 | 62 | dataset = datasets.Dataset.from_file(filename=dataset_path, info=datasets.DatasetInfo(features=features)) 63 | 64 | return dataset 65 | -------------------------------------------------------------------------------- /src/datasets/io/generator.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | 3 | from .. 
import Features, NamedSplit, Split 4 | from ..packaged_modules.generator.generator import Generator 5 | from .abc import AbstractDatasetInputStream 6 | 7 | 8 | class GeneratorDatasetInputStream(AbstractDatasetInputStream): 9 | def __init__( 10 | self, 11 | generator: Callable, 12 | features: Optional[Features] = None, 13 | cache_dir: str = None, 14 | keep_in_memory: bool = False, 15 | streaming: bool = False, 16 | gen_kwargs: Optional[dict] = None, 17 | num_proc: Optional[int] = None, 18 | split: NamedSplit = Split.TRAIN, 19 | fingerprint: Optional[str] = None, 20 | **kwargs, 21 | ): 22 | super().__init__( 23 | features=features, 24 | cache_dir=cache_dir, 25 | keep_in_memory=keep_in_memory, 26 | streaming=streaming, 27 | num_proc=num_proc, 28 | **kwargs, 29 | ) 30 | self.builder = Generator( 31 | cache_dir=cache_dir, 32 | features=features, 33 | generator=generator, 34 | gen_kwargs=gen_kwargs, 35 | split=split, 36 | config_id="default-fingerprint=" + fingerprint if fingerprint else None, 37 | **kwargs, 38 | ) 39 | self.fingerprint = fingerprint 40 | 41 | def read(self): 42 | # Build iterable dataset 43 | if self.streaming: 44 | dataset = self.builder.as_streaming_dataset(split=self.builder.config.split) 45 | # Build regular (map-style) dataset 46 | else: 47 | download_config = None 48 | download_mode = None 49 | verification_mode = None 50 | base_path = None 51 | 52 | self.builder.download_and_prepare( 53 | download_config=download_config, 54 | download_mode=download_mode, 55 | verification_mode=verification_mode, 56 | base_path=base_path, 57 | num_proc=self.num_proc, 58 | ) 59 | dataset = self.builder.as_dataset( 60 | split=self.builder.config.split, verification_mode=verification_mode, in_memory=self.keep_in_memory 61 | ) 62 | if self.fingerprint: 63 | dataset._fingerprint = self.fingerprint 64 | return dataset 65 | -------------------------------------------------------------------------------- /tests/features/test_pdf.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from datasets import Dataset, Features, Pdf 6 | 7 | from ..utils import require_pdfplumber 8 | 9 | 10 | @require_pdfplumber 11 | @pytest.mark.parametrize( 12 | "build_example", 13 | [ 14 | lambda pdf_path: pdf_path, 15 | lambda pdf_path: Path(pdf_path), 16 | lambda pdf_path: open(pdf_path, "rb").read(), 17 | lambda pdf_path: {"path": pdf_path}, 18 | lambda pdf_path: {"path": pdf_path, "bytes": None}, 19 | lambda pdf_path: {"path": pdf_path, "bytes": open(pdf_path, "rb").read()}, 20 | lambda pdf_path: {"path": None, "bytes": open(pdf_path, "rb").read()}, 21 | lambda pdf_path: {"bytes": open(pdf_path, "rb").read()}, 22 | ], 23 | ) 24 | def test_pdf_feature_encode_example(shared_datadir, build_example): 25 | import pdfplumber 26 | 27 | pdf_path = str(shared_datadir / "test_pdf.pdf") 28 | pdf = Pdf() 29 | encoded_example = pdf.encode_example(build_example(pdf_path)) 30 | assert isinstance(encoded_example, dict) 31 | assert encoded_example.keys() == {"bytes", "path"} 32 | assert encoded_example["bytes"] is not None or encoded_example["path"] is not None 33 | decoded_example = pdf.decode_example(encoded_example) 34 | assert isinstance(decoded_example, pdfplumber.pdf.PDF) 35 | 36 | 37 | @require_pdfplumber 38 | def test_dataset_with_pdf_feature(shared_datadir): 39 | import pdfplumber 40 | 41 | pdf_path = str(shared_datadir / "test_pdf.pdf") 42 | data = {"pdf": [pdf_path]} 43 | features = Features({"pdf": Pdf()}) 44 | dset = 
Dataset.from_dict(data, features=features) 45 | item = dset[0] 46 | assert item.keys() == {"pdf"} 47 | assert isinstance(item["pdf"], pdfplumber.pdf.PDF) 48 | batch = dset[:1] 49 | assert len(batch) == 1 50 | assert batch.keys() == {"pdf"} 51 | assert isinstance(batch["pdf"], list) and all(isinstance(item, pdfplumber.pdf.PDF) for item in batch["pdf"]) 52 | column = dset["pdf"] 53 | assert len(column) == 1 54 | assert isinstance(column, list) and all(isinstance(item, pdfplumber.pdf.PDF) for item in column) 55 | 56 | # from bytes 57 | with open(pdf_path, "rb") as f: 58 | data = {"pdf": [f.read()]} 59 | dset = Dataset.from_dict(data, features=features) 60 | item = dset[0] 61 | assert item.keys() == {"pdf"} 62 | assert isinstance(item["pdf"], pdfplumber.pdf.PDF) 63 | -------------------------------------------------------------------------------- /benchmarks/benchmark_getitem_100B.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from dataclasses import dataclass 4 | 5 | import numpy as np 6 | import pyarrow as pa 7 | 8 | import datasets 9 | from utils import get_duration 10 | 11 | 12 | SPEED_TEST_N_EXAMPLES = 100_000_000_000 13 | SPEED_TEST_CHUNK_SIZE = 10_000 14 | 15 | RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__) 16 | RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json")) 17 | 18 | 19 | def generate_100B_dataset(num_examples: int, chunk_size: int) -> datasets.Dataset: 20 | table = pa.Table.from_pydict({"col": [0] * chunk_size}) 21 | table = pa.concat_tables([table] * (num_examples // chunk_size)) 22 | return datasets.Dataset(table, fingerprint="table_100B") 23 | 24 | 25 | @dataclass 26 | class RandIter: 27 | low: int 28 | high: int 29 | size: int 30 | seed: int 31 | 32 | def __post_init__(self): 33 | rng = np.random.default_rng(self.seed) 34 | self._sampled_values = rng.integers(low=self.low, high=self.high, size=self.size).tolist() 35 | 36 | def __iter__(self): 37 | return iter(self._sampled_values) 38 | 39 | def __len__(self): 40 | return self.size 41 | 42 | 43 | @get_duration 44 | def get_first_row(dataset: datasets.Dataset): 45 | _ = dataset[0] 46 | 47 | 48 | @get_duration 49 | def get_last_row(dataset: datasets.Dataset): 50 | _ = dataset[-1] 51 | 52 | 53 | @get_duration 54 | def get_batch_of_1024_rows(dataset: datasets.Dataset): 55 | _ = dataset[range(len(dataset) // 2, len(dataset) // 2 + 1024)] 56 | 57 | 58 | @get_duration 59 | def get_batch_of_1024_random_rows(dataset: datasets.Dataset): 60 | _ = dataset[RandIter(0, len(dataset), 1024, seed=42)] 61 | 62 | 63 | def benchmark_table_100B(): 64 | times = {"num examples": SPEED_TEST_N_EXAMPLES} 65 | functions = (get_first_row, get_last_row, get_batch_of_1024_rows, get_batch_of_1024_random_rows) 66 | print("generating dataset") 67 | dataset = generate_100B_dataset(num_examples=SPEED_TEST_N_EXAMPLES, chunk_size=SPEED_TEST_CHUNK_SIZE) 68 | print("Functions") 69 | for func in functions: 70 | print(func.__name__) 71 | times[func.__name__] = func(dataset) 72 | 73 | with open(RESULTS_FILE_PATH, "wb") as f: 74 | f.write(json.dumps(times).encode("utf-8")) 75 | 76 | 77 | if __name__ == "__main__": # useful to run the profiler 78 | benchmark_table_100B() 79 | -------------------------------------------------------------------------------- /tests/packaged_modules/test_arrow.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import pytest 3 | 4 | from 
datasets.builder import InvalidConfigName 5 | from datasets.data_files import DataFilesList 6 | from datasets.packaged_modules.arrow.arrow import Arrow, ArrowConfig 7 | 8 | 9 | @pytest.fixture 10 | def arrow_file_streaming_format(tmp_path): 11 | filename = tmp_path / "stream.arrow" 12 | testdata = [[1, 1, 1], [0, 100, 6], [1, 90, 900]] 13 | 14 | schema = pa.schema([pa.field("input_ids", pa.list_(pa.int32()))]) 15 | array = pa.array(testdata, type=pa.list_(pa.int32())) 16 | table = pa.Table.from_arrays([array], schema=schema) 17 | with open(filename, "wb") as f: 18 | with pa.ipc.new_stream(f, schema) as writer: 19 | writer.write_table(table) 20 | return str(filename) 21 | 22 | 23 | @pytest.fixture 24 | def arrow_file_file_format(tmp_path): 25 | filename = tmp_path / "file.arrow" 26 | testdata = [[1, 1, 1], [0, 100, 6], [1, 90, 900]] 27 | 28 | schema = pa.schema([pa.field("input_ids", pa.list_(pa.int32()))]) 29 | array = pa.array(testdata, type=pa.list_(pa.int32())) 30 | table = pa.Table.from_arrays([array], schema=schema) 31 | with open(filename, "wb") as f: 32 | with pa.ipc.new_file(f, schema) as writer: 33 | writer.write_table(table) 34 | return str(filename) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "file_fixture, config_kwargs", 39 | [ 40 | ("arrow_file_streaming_format", {}), 41 | ("arrow_file_file_format", {}), 42 | ], 43 | ) 44 | def test_arrow_generate_tables(file_fixture, config_kwargs, request): 45 | arrow = Arrow(**config_kwargs) 46 | generator = arrow._generate_tables([[request.getfixturevalue(file_fixture)]]) 47 | pa_table = pa.concat_tables([table for _, table in generator]) 48 | 49 | expected = {"input_ids": [[1, 1, 1], [0, 100, 6], [1, 90, 900]]} 50 | assert pa_table.to_pydict() == expected 51 | 52 | 53 | def test_config_raises_when_invalid_name() -> None: 54 | with pytest.raises(InvalidConfigName, match="Bad characters"): 55 | _ = ArrowConfig(name="name-with-*-invalid-character") 56 | 57 | 58 | @pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])]) 59 | def test_config_raises_when_invalid_data_files(data_files) -> None: 60 | with pytest.raises(ValueError, match="Expected a DataFilesDict"): 61 | _ = ArrowConfig(name="name", data_files=data_files) 62 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datasets 4 | import datasets.config 5 | 6 | 7 | # Import fixture modules as plugins 8 | pytest_plugins = ["tests.fixtures.files", "tests.fixtures.hub", "tests.fixtures.fsspec"] 9 | 10 | 11 | def pytest_collection_modifyitems(config, items): 12 | # Mark tests as "unit" by default if not marked as "integration" (or already marked as "unit") 13 | for item in items: 14 | if any(marker in item.keywords for marker in ["integration", "unit"]): 15 | continue 16 | item.add_marker(pytest.mark.unit) 17 | 18 | 19 | @pytest.fixture(autouse=True) 20 | def set_test_cache_config(tmp_path_factory, monkeypatch): 21 | # test_hf_cache_home = tmp_path_factory.mktemp("cache") # TODO: why a cache dir per test function does not work? 
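    # Fall back to a single cache directory shared by the whole test session, under pytest's base temp dir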
22 | test_hf_cache_home = tmp_path_factory.getbasetemp() / "cache" 23 | test_hf_datasets_cache = test_hf_cache_home / "datasets" 24 | monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", str(test_hf_datasets_cache)) 25 | test_downloaded_datasets_path = test_hf_datasets_cache / "downloads" 26 | monkeypatch.setattr("datasets.config.DOWNLOADED_DATASETS_PATH", str(test_downloaded_datasets_path)) 27 | test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted" 28 | monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path)) 29 | 30 | 31 | @pytest.fixture(autouse=True) 32 | def disable_implicit_token(monkeypatch): 33 | monkeypatch.setattr("huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN", True) 34 | 35 | 36 | @pytest.fixture(autouse=True, scope="session") 37 | def disable_tqdm_output(): 38 | datasets.disable_progress_bar() 39 | 40 | 41 | @pytest.fixture(autouse=True) 42 | def set_update_download_counts_to_false(monkeypatch): 43 | # don't take tests into account when counting downloads 44 | monkeypatch.setattr("datasets.config.HF_UPDATE_DOWNLOAD_COUNTS", False) 45 | 46 | 47 | @pytest.fixture 48 | def set_sqlalchemy_silence_uber_warning(monkeypatch): 49 | # Required to suppress RemovedIn20Warning when feature(s) are not compatible with SQLAlchemy 2.0 50 | # To be removed once SQLAlchemy 2.0 supported 51 | try: 52 | monkeypatch.setattr("sqlalchemy.util.deprecations.SILENCE_UBER_WARNING", True) 53 | except (ModuleNotFoundError, AttributeError): 54 | pass 55 | 56 | 57 | @pytest.fixture(autouse=True, scope="session") 58 | def zero_time_out_for_remote_code(): 59 | datasets.config.TIME_OUT_REMOTE_CODE = 0 60 | -------------------------------------------------------------------------------- /tests/test_filesystem.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import fsspec 4 | import pytest 5 | from fsspec.core import url_to_fs 6 | from fsspec.registry import _registry as _fsspec_registry 7 | 8 | from datasets.filesystems import COMPRESSION_FILESYSTEMS, is_remote_filesystem 9 | 10 | from .utils import require_lz4, require_zstandard 11 | 12 | 13 | def test_mockfs(mockfs): 14 | assert "mock" in _fsspec_registry 15 | assert "bz2" in _fsspec_registry 16 | 17 | 18 | def test_non_mockfs(): 19 | assert "mock" not in _fsspec_registry 20 | assert "bz2" in _fsspec_registry 21 | 22 | 23 | def test_is_remote_filesystem(mockfs): 24 | is_remote = is_remote_filesystem(mockfs) 25 | assert is_remote is True 26 | 27 | fs = fsspec.filesystem("file") 28 | 29 | is_remote = is_remote_filesystem(fs) 30 | assert is_remote is False 31 | 32 | 33 | @pytest.mark.parametrize("compression_fs_class", COMPRESSION_FILESYSTEMS) 34 | def test_compression_filesystems(compression_fs_class, gz_file, bz2_file, lz4_file, zstd_file, xz_file, text_file): 35 | input_paths = {"gzip": gz_file, "xz": xz_file, "zstd": zstd_file, "bz2": bz2_file, "lz4": lz4_file} 36 | input_path = input_paths[compression_fs_class.protocol] 37 | if input_path is None: 38 | reason = f"for '{compression_fs_class.protocol}' compression protocol, " 39 | if compression_fs_class.protocol == "lz4": 40 | reason += require_lz4.kwargs["reason"] 41 | elif compression_fs_class.protocol == "zstd": 42 | reason += require_zstandard.kwargs["reason"] 43 | pytest.skip(reason) 44 | fs = fsspec.filesystem(compression_fs_class.protocol, fo=input_path) 45 | expected_filename = os.path.basename(input_path) 46 | expected_filename = expected_filename[: 
expected_filename.rindex(".")] 47 | assert fs.glob("*") == [expected_filename] 48 | with fs.open(expected_filename, "r", encoding="utf-8") as f, open(text_file, encoding="utf-8") as expected_file: 49 | assert f.read() == expected_file.read() 50 | 51 | 52 | @pytest.mark.parametrize("protocol", ["zip", "gzip"]) 53 | def test_fs_isfile(protocol, zip_jsonl_path, jsonl_gz_path): 54 | compressed_file_paths = {"zip": zip_jsonl_path, "gzip": jsonl_gz_path} 55 | compressed_file_path = compressed_file_paths[protocol] 56 | member_file_path = "dataset.jsonl" 57 | path = f"{protocol}://{member_file_path}::{compressed_file_path}" 58 | fs, *_ = url_to_fs(path) 59 | assert fs.isfile(member_file_path) 60 | assert not fs.isfile("non_existing_" + member_file_path) 61 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/imagefolder/imagefolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class ImageFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """BuilderConfig for ImageFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | super().__post_init__() 17 | 18 | 19 | class ImageFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Image 21 | BASE_COLUMN_NAME = "image" 22 | BUILDER_CONFIG_CLASS = ImageFolderConfig 23 | EXTENSIONS: list[str] # definition at the bottom of the script 24 | 25 | 26 | # Obtained with: 27 | # ``` 28 | # import PIL.Image 29 | # IMAGE_EXTENSIONS = [] 30 | # PIL.Image.init() 31 | # for ext, format in PIL.Image.EXTENSION.items(): 32 | # if format in PIL.Image.OPEN: 33 | # IMAGE_EXTENSIONS.append(ext[1:]) 34 | # ``` 35 | # We intentionally do not run this code on launch because: 36 | # (1) Pillow is an optional dependency, so importing Pillow in global namespace is not allowed 37 | # (2) To ensure the list of supported extensions is deterministic 38 | IMAGE_EXTENSIONS = [ 39 | ".blp", 40 | ".bmp", 41 | ".dib", 42 | ".bufr", 43 | ".cur", 44 | ".pcx", 45 | ".dcx", 46 | ".dds", 47 | ".ps", 48 | ".eps", 49 | ".fit", 50 | ".fits", 51 | ".fli", 52 | ".flc", 53 | ".ftc", 54 | ".ftu", 55 | ".gbr", 56 | ".gif", 57 | ".grib", 58 | # ".h5", # may contain zero or several images 59 | # ".hdf", # may contain zero or several images 60 | ".png", 61 | ".apng", 62 | ".jp2", 63 | ".j2k", 64 | ".jpc", 65 | ".jpf", 66 | ".jpx", 67 | ".j2c", 68 | ".icns", 69 | ".ico", 70 | ".im", 71 | ".iim", 72 | ".tif", 73 | ".tiff", 74 | ".jfif", 75 | ".jpe", 76 | ".jpg", 77 | ".jpeg", 78 | ".mpg", 79 | ".mpeg", 80 | ".msp", 81 | ".pcd", 82 | ".pxr", 83 | ".pbm", 84 | ".pgm", 85 | ".ppm", 86 | ".pnm", 87 | ".psd", 88 | ".bw", 89 | ".rgb", 90 | ".rgba", 91 | ".sgi", 92 | ".ras", 93 | ".tga", 94 | ".icb", 95 | ".vda", 96 | ".vst", 97 | ".webp", 98 | ".wmf", 99 | ".emf", 100 | ".xbm", 101 | ".xpm", 102 | ] 103 | ImageFolder.EXTENSIONS = IMAGE_EXTENSIONS 104 | -------------------------------------------------------------------------------- /docs/source/about_arrow.md: -------------------------------------------------------------------------------- 1 | # Datasets 🤝 Arrow 2 | 3 | ## What is Arrow? 4 | 5 | [Arrow](https://arrow.apache.org/) enables large amounts of data to be processed and moved quickly. 
It is a specific data format that stores data in a columnar memory layout. This provides several significant advantages: 6 | 7 | * Arrow's standard format allows [zero-copy reads](https://en.wikipedia.org/wiki/Zero-copy) which removes virtually all serialization overhead. 8 | * Arrow is language-agnostic so it supports different programming languages. 9 | * Arrow is column-oriented so it is faster at querying and processing slices or columns of data. 10 | * Arrow allows for copy-free hand-offs to standard machine learning tools such as NumPy, Pandas, PyTorch, and TensorFlow. 11 | * Arrow supports many, possibly nested, column types. 12 | 13 | ## Memory-mapping 14 | 15 | 🤗 Datasets uses Arrow for its local caching system. It allows datasets to be backed by an on-disk cache, which is memory-mapped for fast lookup. 16 | This architecture allows for large datasets to be used on machines with relatively small device memory. 17 | 18 | For example, loading the full English Wikipedia dataset only takes a few MB of RAM: 19 | 20 | ```python 21 | >>> import os; import psutil; import timeit 22 | >>> from datasets import load_dataset 23 | 24 | # Process.memory_info is expressed in bytes, so convert to megabytes 25 | >>> mem_before = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024) 26 | >>> wiki = load_dataset("wikimedia/wikipedia", "20220301.en", split="train") 27 | >>> mem_after = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024) 28 | 29 | >>> print(f"RAM memory used: {(mem_after - mem_before)} MB") 30 | RAM memory used: 50 MB 31 | ``` 32 | 33 | This is possible because the Arrow data is actually memory-mapped from disk, and not loaded in memory. 34 | Memory-mapping allows access to data on disk, and leverages virtual memory capabilities for fast lookups. 35 | 36 | ## Performance 37 | 38 | Iterating over a memory-mapped dataset using Arrow is fast. Iterating over Wikipedia on a laptop gives you speeds of 1-3 Gbit/s: 39 | 40 | ```python 41 | >>> s = """batch_size = 1000 42 | ... for batch in wiki.iter(batch_size): 43 | ... ... 44 | ... """ 45 | 46 | >>> elapsed_time = timeit.timeit(stmt=s, number=1, globals=globals()) 47 | >>> print(f"Time to iterate over the {wiki.dataset_size >> 30} GB dataset: {elapsed_time:.1f} sec, " 48 | ... f"ie. {float(wiki.dataset_size >> 27)/elapsed_time:.1f} Gb/s") 49 | Time to iterate over the 18 GB dataset: 31.8 sec, ie. 4.8 Gb/s 50 | ``` 51 | -------------------------------------------------------------------------------- /src/datasets/utils/_filelock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License 15 | """Utilities to handle file locking in `datasets`.""" 16 | 17 | import os 18 | 19 | from filelock import FileLock as FileLock_ 20 | from filelock import UnixFileLock 21 | from filelock import __version__ as _filelock_version 22 | from packaging import version 23 | 24 | 25 | class FileLock(FileLock_): 26 | """ 27 | A `filelock.FileLock` initializer that handles long paths. 28 | It also uses the current umask for lock files. 29 | """ 30 | 31 | MAX_FILENAME_LENGTH = 255 32 | 33 | def __init__(self, lock_file, *args, **kwargs): 34 | # The "mode" argument is required if we want to use the current umask in filelock >= 3.10 35 | # In previous previous it was already using the current umask. 36 | if "mode" not in kwargs and version.parse(_filelock_version) >= version.parse("3.10.0"): 37 | umask = os.umask(0o666) 38 | os.umask(umask) 39 | kwargs["mode"] = 0o666 & ~umask 40 | lock_file = self.hash_filename_if_too_long(lock_file) 41 | super().__init__(lock_file, *args, **kwargs) 42 | 43 | @classmethod 44 | def hash_filename_if_too_long(cls, path: str) -> str: 45 | path = os.path.abspath(os.path.expanduser(path)) 46 | filename = os.path.basename(path) 47 | max_filename_length = cls.MAX_FILENAME_LENGTH 48 | if issubclass(cls, UnixFileLock): 49 | max_filename_length = min(max_filename_length, os.statvfs(os.path.dirname(path)).f_namemax) 50 | if len(filename) > max_filename_length: 51 | dirname = os.path.dirname(path) 52 | hashed_filename = str(hash(filename)) 53 | new_filename = ( 54 | filename[: max_filename_length - len(hashed_filename) - 8] + "..." + hashed_filename + ".lock" 55 | ) 56 | return os.path.join(dirname, new_filename) 57 | else: 58 | return path 59 | -------------------------------------------------------------------------------- /docs/source/package_reference/utilities.mdx: -------------------------------------------------------------------------------- 1 | # Utilities 2 | 3 | ## Configure logging 4 | 5 | 🤗 Datasets strives to be transparent and explicit about how it works, but this can be quite verbose at times. We have included a series of logging methods which allow you to easily adjust the level of verbosity of the entire library. Currently the default verbosity of the library is set to `WARNING`. 6 | 7 | To change the level of verbosity, use one of the direct setters. For instance, here is how to change the verbosity to the `INFO` level: 8 | 9 | ```py 10 | import datasets 11 | datasets.logging.set_verbosity_info() 12 | ``` 13 | 14 | You can also use the environment variable `DATASETS_VERBOSITY` to override the default verbosity, and set it to one of the following: `debug`, `info`, `warning`, `error`, `critical`: 15 | 16 | ```bash 17 | DATASETS_VERBOSITY=error ./myprogram.py 18 | ``` 19 | 20 | All the methods of this logging module are documented below. The main ones are: 21 | 22 | - [`logging.get_verbosity`] to get the current level of verbosity in the logger 23 | - [`logging.set_verbosity`] to set the verbosity to the level of your choice 24 | 25 | In order from the least to the most verbose (with their corresponding `int` values): 26 | 27 | 1. `logging.CRITICAL` or `logging.FATAL` (int value, 50): only report the most critical errors. 28 | 2. `logging.ERROR` (int value, 40): only report errors. 29 | 3. `logging.WARNING` or `logging.WARN` (int value, 30): only reports error and warnings. This the default level used by the library. 30 | 4. 
`logging.INFO` (int value, 20): reports error, warnings and basic information. 31 | 5. `logging.DEBUG` (int value, 10): report all information. 32 | 33 | [[autodoc]] datasets.logging.get_verbosity 34 | 35 | [[autodoc]] datasets.logging.set_verbosity 36 | 37 | [[autodoc]] datasets.logging.set_verbosity_info 38 | 39 | [[autodoc]] datasets.logging.set_verbosity_warning 40 | 41 | [[autodoc]] datasets.logging.set_verbosity_debug 42 | 43 | [[autodoc]] datasets.logging.set_verbosity_error 44 | 45 | [[autodoc]] datasets.logging.disable_propagation 46 | 47 | [[autodoc]] datasets.logging.enable_propagation 48 | 49 | ## Configure progress bars 50 | 51 | By default, `tqdm` progress bars will be displayed during dataset download and preprocessing. You can disable them globally by setting `HF_DATASETS_DISABLE_PROGRESS_BARS` 52 | environment variable. You can also enable/disable them using [`~utils.enable_progress_bars`] and [`~utils.disable_progress_bars`]. If set, the environment variable has priority on the helpers. 53 | 54 | [[autodoc]] datasets.utils.enable_progress_bars 55 | 56 | [[autodoc]] datasets.utils.disable_progress_bars 57 | 58 | [[autodoc]] datasets.utils.are_progress_bars_disabled -------------------------------------------------------------------------------- /docs/source/dataset_card.mdx: -------------------------------------------------------------------------------- 1 | # Create a dataset card 2 | 3 | Each dataset should have a dataset card to promote responsible usage and inform users of any potential biases within the dataset. 4 | This idea was inspired by the Model Cards proposed by [Mitchell, 2018](https://huggingface.co/papers/1810.03993). 5 | Dataset cards help users understand a dataset's contents, the context for using the dataset, how it was created, and any other considerations a user should be aware of. 6 | 7 | Creating a dataset card is easy and can be done in just a few steps: 8 | 9 | 1. Go to your dataset repository on the [Hub](https://hf.co/new-dataset) and click on **Create Dataset Card** to create a new `README.md` file in your repository. 10 | 11 | 2. Use the **Metadata UI** to select the tags that describe your dataset. You can add a license, language, pretty_name, the task_categories, size_categories, and any other tags that you think are relevant. These tags help users discover and find your dataset on the Hub. 12 | 13 |
14 | 15 | 16 |
17 | 18 | > [!TIP] 19 | > For a complete, but not required, set of tag options you can also look at the [Dataset Card specifications](https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1). This'll have a few more tag options like `multilinguality` and `language_creators` which are useful but not absolutely necessary. 20 | 21 | 3. Click on the **Import dataset card template** link to automatically create a template with all the relevant fields to complete. Fill out the template sections to the best of your ability. Take a look at the [Dataset Card Creation Guide](https://github.com/huggingface/datasets/blob/main/templates/README_guide.md) for more detailed information about what to include in each section of the card. For fields you are unable to complete, you can write **[More Information Needed]**. 22 | 23 | 4. Once you're done, commit the changes to the `README.md` file and you'll see the completed dataset card on your repository. 24 | 25 | YAML also allows you to customize the way your dataset is loaded by [defining splits and/or configurations](./repository_structure#define-your-splits-and-subsets-in-yaml) without the need to write any code. 26 | 27 | Feel free to take a look at the [SNLI](https://huggingface.co/datasets/stanfordnlp/snli), [CNN/DailyMail](https://huggingface.co/datasets/abisee/cnn_dailymail), and [Allociné](https://huggingface.co/datasets/tblard/allocine) dataset cards as examples to help you get started. 28 | -------------------------------------------------------------------------------- /benchmarks/benchmark_map_filter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | 5 | import transformers 6 | 7 | import datasets 8 | from utils import generate_example_dataset, get_duration 9 | 10 | 11 | SPEED_TEST_N_EXAMPLES = 500_000 12 | 13 | RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__) 14 | RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json")) 15 | 16 | 17 | @get_duration 18 | def map(dataset: datasets.Dataset, **kwargs): 19 | _ = dataset.map(**kwargs) 20 | 21 | 22 | @get_duration 23 | def filter(dataset: datasets.Dataset, **kwargs): 24 | _ = dataset.filter(**kwargs) 25 | 26 | 27 | def benchmark_map_filter(): 28 | times = {"num examples": SPEED_TEST_N_EXAMPLES} 29 | with tempfile.TemporaryDirectory() as tmp_dir: 30 | features = datasets.Features({"text": datasets.Value("string"), "numbers": datasets.Value("float32")}) 31 | dataset = generate_example_dataset( 32 | os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES 33 | ) 34 | 35 | tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True) 36 | 37 | def tokenize(examples): 38 | return tokenizer(examples["text"]) 39 | 40 | times["map identity"] = map(dataset) 41 | 42 | times["map identity batched"] = map(dataset, batched=True) 43 | 44 | times["map no-op batched"] = map(dataset, function=lambda x: None, batched=True) 45 | 46 | with dataset.formatted_as(type="numpy"): 47 | times["map no-op batched numpy"] = map(dataset, function=lambda x: None, batched=True) 48 | 49 | with dataset.formatted_as(type="pandas"): 50 | times["map no-op batched pandas"] = map(dataset, function=lambda x: None, batched=True) 51 | 52 | with dataset.formatted_as(type="torch", columns="numbers"): 53 | times["map no-op batched pytorch"] = map(dataset, function=lambda x: None, batched=True) 54 | 55 | with 
dataset.formatted_as(type="tensorflow", columns="numbers"): 56 | times["map no-op batched tensorflow"] = map(dataset, function=lambda x: None, batched=True) 57 | 58 | times["map fast-tokenizer batched"] = map(dataset, function=tokenize, batched=True) 59 | 60 | times["filter"] = filter(dataset) 61 | 62 | # Activate later when tokenizer support batched inputs 63 | # with dataset.formatted_as(type='numpy'): 64 | # times[func.__name__ + " fast-tokenizer batched numpy"] = func(dataset, function=tokenize, batched=True) 65 | 66 | with open(RESULTS_FILE_PATH, "wb") as f: 67 | f.write(json.dumps(times).encode("utf-8")) 68 | 69 | 70 | if __name__ == "__main__": # useful to run the profiler 71 | benchmark_map_filter() 72 | -------------------------------------------------------------------------------- /docs/source/use_with_pandas.mdx: -------------------------------------------------------------------------------- 1 | # Use with Pandas 2 | 3 | This document is a quick introduction to using `datasets` with Pandas, with a particular focus on how to process 4 | datasets using Pandas functions, and how to convert a dataset to Pandas or from Pandas. 5 | 6 | This is particularly useful as it allows fast operations, since `datasets` uses PyArrow under the hood and PyArrow is well integrated with Pandas. 7 | 8 | ## Dataset format 9 | 10 | By default, datasets return regular Python objects: integers, floats, strings, lists, etc. 11 | 12 | To get Pandas DataFrames or Series instead, you can set the format of the dataset to `pandas` using [`Dataset.with_format`]: 13 | 14 | ```py 15 | >>> from datasets import Dataset 16 | >>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]} 17 | >>> ds = Dataset.from_dict(data) 18 | >>> ds = ds.with_format("pandas") 19 | >>> ds[0] # pd.DataFrame 20 | col_0 col_1 21 | 0 a 0.0 22 | >>> ds[:2] # pd.DataFrame 23 | col_0 col_1 24 | 0 a 0.0 25 | 1 b 0.0 26 | >>> ds["data"] # pd.Series 27 | 0 a 28 | 1 b 29 | 2 c 30 | 3 d 31 | Name: col_0, dtype: object 32 | ``` 33 | 34 | This also works for `IterableDataset` objects obtained e.g. using `load_dataset(..., streaming=True)`: 35 | 36 | ```py 37 | >>> ds = ds.with_format("pandas") 38 | >>> for df in ds.iter(batch_size=2): 39 | ... print(df) 40 | ... break 41 | col_0 col_1 42 | 0 a 0.0 43 | 1 b 0.0 44 | ``` 45 | 46 | ## Process data 47 | 48 | Pandas functions are generally faster than regular hand-written python functions, and therefore they are a good option to optimize data processing. You can use Pandas functions to process a dataset in [`Dataset.map`] or [`Dataset.filter`]: 49 | 50 | ```python 51 | >>> from datasets import Dataset 52 | >>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]} 53 | >>> ds = Dataset.from_dict(data) 54 | >>> ds = ds.with_format("pandas") 55 | >>> ds = ds.map(lambda df: df.assign(col_2=df.col_1 + 1), batched=True) 56 | >>> ds[:2] 57 | col_0 col_1 col_2 58 | 0 a 0.0 1.0 59 | 1 b 0.0 1.0 60 | >>> ds = ds.filter(lambda df: df.col_0 == "b", batched=True) 61 | >>> ds[0] 62 | col_0 col_1 col_2 63 | 0 b 0.0 1.0 64 | ``` 65 | 66 | We use `batched=True` because it is faster to process batches of data in Pandas rather than row by row. It's also possible to use `batch_size=` in `map()` to set the size of each `df`. 67 | 68 | This also works for [`IterableDataset.map`] and [`IterableDataset.filter`]. 
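For instance, here is a minimal sketch of the same pandas-based `map` on an `IterableDataset` (the call to `to_iterable_dataset` below is only used to build a small streaming dataset for illustration):

```python
>>> from datasets import Dataset
>>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]}
>>> ids = Dataset.from_dict(data).to_iterable_dataset()
>>> ids = ids.with_format("pandas")
>>> ids = ids.map(lambda df: df.assign(col_2=df.col_1 + 1), batched=True)
>>> for df in ids.iter(batch_size=2):
...     # each `df` should be a pd.DataFrame that includes the new `col_2` column
...     break
```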
69 | 70 | ## Import or Export from Pandas 71 | 72 | To import data from Pandas, you can use [`Dataset.from_pandas`]: 73 | 74 | ```python 75 | ds = Dataset.from_pandas(df) 76 | ``` 77 | 78 | And you can use [`Dataset.to_pandas`] to export a Dataset to a Pandas DataFrame: 79 | 80 | 81 | ```python 82 | df = Dataset.to_pandas() 83 | ``` 84 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/pandas/pandas.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import warnings 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | import pandas as pd 7 | import pyarrow as pa 8 | 9 | import datasets 10 | from datasets.builder import Key 11 | from datasets.table import table_cast 12 | 13 | 14 | @dataclass 15 | class PandasConfig(datasets.BuilderConfig): 16 | """BuilderConfig for Pandas.""" 17 | 18 | features: Optional[datasets.Features] = None 19 | 20 | def __post_init__(self): 21 | super().__post_init__() 22 | 23 | 24 | class Pandas(datasets.ArrowBasedBuilder): 25 | BUILDER_CONFIG_CLASS = PandasConfig 26 | 27 | def _info(self): 28 | warnings.warn( 29 | "The Pandas builder is deprecated and will be removed in the next major version of datasets.", 30 | FutureWarning, 31 | ) 32 | return datasets.DatasetInfo(features=self.config.features) 33 | 34 | def _split_generators(self, dl_manager): 35 | """We handle string, list and dicts in datafiles""" 36 | if not self.config.data_files: 37 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 38 | data_files = dl_manager.download_and_extract(self.config.data_files) 39 | if isinstance(data_files, (str, list, tuple)): 40 | files = data_files 41 | if isinstance(files, str): 42 | files = [files] 43 | # Use `dl_manager.iter_files` to skip hidden files in an extracted archive 44 | files = [dl_manager.iter_files(file) for file in files] 45 | return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})] 46 | splits = [] 47 | for split_name, files in data_files.items(): 48 | if isinstance(files, str): 49 | files = [files] 50 | # Use `dl_manager.iter_files` to skip hidden files in an extracted archive 51 | files = [dl_manager.iter_files(file) for file in files] 52 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) 53 | return splits 54 | 55 | def _cast_table(self, pa_table: pa.Table) -> pa.Table: 56 | if self.config.features is not None: 57 | # more expensive cast to support nested features with keys in a different order 58 | # allows str <-> int/float or str to Audio for example 59 | pa_table = table_cast(pa_table, self.config.features.arrow_schema) 60 | return pa_table 61 | 62 | def _generate_tables(self, files): 63 | for i, file in enumerate(itertools.chain.from_iterable(files)): 64 | with open(file, "rb") as f: 65 | pa_table = pa.Table.from_pandas(pd.read_pickle(f)) 66 | yield Key(i, 0), self._cast_table(pa_table) 67 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/eval/eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from itertools import islice 4 | 5 | import pyarrow as pa 6 | 7 | import datasets 8 | from datasets.builder import Key 9 | 10 | 11 | logger = datasets.utils.logging.get_logger(__name__) 12 | 13 | 14 | class Eval(datasets.GeneratorBasedBuilder): 15 | 
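    # Number of sample records to read when inferring the features if they are not already set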
NUM_EXAMPLES_FOR_FEATURES_INFERENCE = 5 16 | 17 | def _info(self): 18 | return datasets.DatasetInfo() 19 | 20 | def _split_generators(self, dl_manager): 21 | """We handle string, list and dicts in datafiles""" 22 | if not self.config.data_files: 23 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 24 | dl_manager.download_config.extract_on_the_fly = True 25 | data_files = dl_manager.download_and_extract(self.config.data_files) 26 | splits = [] 27 | for split_name, logs in data_files.items(): 28 | if isinstance(logs, str): 29 | logs = [logs] 30 | logs_files = [dl_manager.iter_files(log) for log in logs] 31 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"logs_files": logs_files})) 32 | if not self.info.features: 33 | first_examples = list( 34 | islice(self._iter_samples_from_log_files(logs_files[0]), self.NUM_EXAMPLES_FOR_FEATURES_INFERENCE) 35 | ) 36 | pa_tables = [pa.Table.from_pylist([example]) for example in first_examples] 37 | inferred_arrow_schema = pa.concat_tables(pa_tables, promote_options="default").schema 38 | self.info.features = datasets.Features.from_arrow_schema(inferred_arrow_schema) 39 | 40 | return splits 41 | 42 | def _sort_samples_key(self, sample_path: str): 43 | # looks like "{sample_idx}_epoch_{epoch_idx}"" 44 | (sample_idx_str, epoch_idx_str) = os.path.splitext(os.path.basename(sample_path))[0].split("_epoch_") 45 | return (int(epoch_idx_str), int(sample_idx_str)) 46 | 47 | def _iter_samples_from_log_files(self, log_files: list[str]): 48 | sample_files = [log_file for log_file in log_files if os.path.basename(os.path.dirname(log_file)) == "samples"] 49 | sample_files.sort(key=self._sort_samples_key) 50 | for sample_file in sample_files: 51 | with open(sample_file) as f: 52 | sample = json.load(f) 53 | for field in sample: 54 | if isinstance(sample[field], dict): 55 | sample[field] = json.dumps(sample[field]) 56 | if isinstance(sample[field], list): 57 | sample[field] = [json.dumps(x) for x in sample[field]] 58 | yield sample 59 | 60 | def _generate_examples(self, logs_files): 61 | for file_idx, log_files in enumerate(logs_files): 62 | for sample_idx, sample in enumerate(self._iter_samples_from_log_files(log_files)): 63 | yield Key(file_idx, sample_idx), sample 64 | -------------------------------------------------------------------------------- /docs/source/index.mdx: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | 4 | 5 | 🤗 Datasets is a library for easily accessing and sharing AI datasets for Audio, Computer Vision, and Natural Language Processing (NLP) tasks. 6 | 7 | Load a dataset in a single line of code, and use our powerful data processing and streaming methods to quickly get your dataset ready for training in a deep learning model. Backed by the Apache Arrow format, process large datasets with zero-copy reads without any memory constraints for optimal speed and efficiency. We also feature a deep integration with the [Hugging Face Hub](https://huggingface.co/datasets), allowing you to easily load and share a dataset with the wider machine learning community. 8 | 9 | Find your dataset today on the [Hugging Face Hub](https://huggingface.co/datasets), and take an in-depth look inside of it with the live viewer. 
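For example, loading a dataset from the Hub takes a single call (the repository id below is a placeholder, not a real dataset):

```py
from datasets import load_dataset

dataset = load_dataset("username/my_dataset", split="train")
```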
10 | 11 | 31 | -------------------------------------------------------------------------------- /docs/source/package_reference/table_classes.mdx: -------------------------------------------------------------------------------- 1 | # Table Classes 2 | 3 | Each `Dataset` object is backed by a PyArrow Table. 4 | A Table can be loaded from either the disk (memory mapped) or in memory. 5 | Several Table types are available, and they all inherit from [`table.Table`]. 6 | 7 | ## Table 8 | 9 | [[autodoc]] datasets.table.Table 10 | - validate 11 | - equals 12 | - to_batches 13 | - to_pydict 14 | - to_pandas 15 | - to_string 16 | - field 17 | - column 18 | - itercolumns 19 | - schema 20 | - columns 21 | - num_columns 22 | - num_rows 23 | - shape 24 | - nbytes 25 | 26 | ## InMemoryTable 27 | 28 | [[autodoc]] datasets.table.InMemoryTable 29 | - validate 30 | - equals 31 | - to_batches 32 | - to_pydict 33 | - to_pandas 34 | - to_string 35 | - field 36 | - column 37 | - itercolumns 38 | - schema 39 | - columns 40 | - num_columns 41 | - num_rows 42 | - shape 43 | - nbytes 44 | - column_names 45 | - slice 46 | - filter 47 | - flatten 48 | - combine_chunks 49 | - cast 50 | - replace_schema_metadata 51 | - add_column 52 | - append_column 53 | - remove_column 54 | - set_column 55 | - rename_columns 56 | - select 57 | - drop 58 | - from_file 59 | - from_buffer 60 | - from_pandas 61 | - from_arrays 62 | - from_pydict 63 | - from_batches 64 | 65 | ## MemoryMappedTable 66 | 67 | [[autodoc]] datasets.table.MemoryMappedTable 68 | - validate 69 | - equals 70 | - to_batches 71 | - to_pydict 72 | - to_pandas 73 | - to_string 74 | - field 75 | - column 76 | - itercolumns 77 | - schema 78 | - columns 79 | - num_columns 80 | - num_rows 81 | - shape 82 | - nbytes 83 | - column_names 84 | - slice 85 | - filter 86 | - flatten 87 | - combine_chunks 88 | - cast 89 | - replace_schema_metadata 90 | - add_column 91 | - append_column 92 | - remove_column 93 | - set_column 94 | - rename_columns 95 | - select 96 | - drop 97 | - from_file 98 | 99 | ## ConcatenationTable 100 | 101 | [[autodoc]] datasets.table.ConcatenationTable 102 | - validate 103 | - equals 104 | - to_batches 105 | - to_pydict 106 | - to_pandas 107 | - to_string 108 | - field 109 | - column 110 | - itercolumns 111 | - schema 112 | - columns 113 | - num_columns 114 | - num_rows 115 | - shape 116 | - nbytes 117 | - column_names 118 | - slice 119 | - filter 120 | - flatten 121 | - combine_chunks 122 | - cast 123 | - replace_schema_metadata 124 | - add_column 125 | - append_column 126 | - remove_column 127 | - set_column 128 | - rename_columns 129 | - select 130 | - drop 131 | - from_blocks 132 | - from_tables 133 | 134 | ## Utils 135 | 136 | [[autodoc]] datasets.table.concat_tables 137 | 138 | [[autodoc]] datasets.table.list_table_cache_files 139 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/xml/xml.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from dataclasses import dataclass 3 | from typing import Optional 4 | 5 | import pyarrow as pa 6 | 7 | import datasets 8 | from datasets.features.features import require_storage_cast 9 | from datasets.table import table_cast 10 | 11 | 12 | logger = datasets.utils.logging.get_logger(__name__) 13 | 14 | 15 | @dataclass 16 | class XmlConfig(datasets.BuilderConfig): 17 | """BuilderConfig for xml files.""" 18 | 19 | features: Optional[datasets.Features] = None 20 | encoding: str = "utf-8" 21 | 
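    # How to handle encoding errors; forwarded to open() as the "errors" argument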
encoding_errors: Optional[str] = None 22 | 23 | 24 | class Xml(datasets.ArrowBasedBuilder): 25 | BUILDER_CONFIG_CLASS = XmlConfig 26 | 27 | def _info(self): 28 | return datasets.DatasetInfo(features=self.config.features) 29 | 30 | def _split_generators(self, dl_manager): 31 | """The `data_files` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]]. 32 | 33 | If str or List[str], then the dataset returns only the 'train' split. 34 | If dict, then keys should be from the `datasets.Split` enum. 35 | """ 36 | if not self.config.data_files: 37 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 38 | dl_manager.download_config.extract_on_the_fly = True 39 | data_files = dl_manager.download_and_extract(self.config.data_files) 40 | splits = [] 41 | for split_name, files in data_files.items(): 42 | if isinstance(files, str): 43 | files = [files] 44 | files = [dl_manager.iter_files(file) for file in files] 45 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) 46 | return splits 47 | 48 | def _cast_table(self, pa_table: pa.Table) -> pa.Table: 49 | if self.config.features is not None: 50 | schema = self.config.features.arrow_schema 51 | if all(not require_storage_cast(feature) for feature in self.config.features.values()): 52 | # cheaper cast 53 | pa_table = pa_table.cast(schema) 54 | else: 55 | # more expensive cast; allows str <-> int/float or str to Audio for example 56 | pa_table = table_cast(pa_table, schema) 57 | return pa_table 58 | else: 59 | return pa_table.cast(pa.schema({"xml": pa.string()})) 60 | 61 | def _generate_tables(self, files): 62 | pa_table_names = list(self.config.features) if self.config.features is not None else ["xml"] 63 | for file_idx, file in enumerate(itertools.chain.from_iterable(files)): 64 | # open in text mode, by default translates universal newlines ("\n", "\r\n" and "\r") into "\n" 65 | with open(file, encoding=self.config.encoding, errors=self.config.encoding_errors) as f: 66 | xml = f.read() 67 | pa_table = pa.Table.from_arrays([pa.array([xml])], names=pa_table_names) 68 | yield (file_idx, 0), self._cast_table(pa_table) 69 | -------------------------------------------------------------------------------- /docs/source/use_with_spark.mdx: -------------------------------------------------------------------------------- 1 | # Use with Spark 2 | 3 | This document is a quick introduction to using 🤗 Datasets with Spark, with a particular focus on how to load a Spark DataFrame into a [`Dataset`] object. 4 | 5 | From there, you have fast access to any element and you can use it as a data loader to train models. 6 | 7 | ## Load from Spark 8 | 9 | A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to PyTorch, TensorFlow and JAX tensors. 10 | The Arrow table is memory mapped from disk, which can load datasets bigger than your available RAM. 11 | 12 | You can get a [`Dataset`] from a Spark DataFrame using [`Dataset.from_spark`]: 13 | 14 | ```py 15 | >>> from datasets import Dataset 16 | >>> df = spark.createDataFrame( 17 | ... data=[[1, "Elia"], [2, "Teo"], [3, "Fang"]], 18 | ... columns=["id", "name"], 19 | ... ) 20 | >>> ds = Dataset.from_spark(df) 21 | ``` 22 | 23 | The Spark workers write the dataset on disk in a cache directory as Arrow files, and the [`Dataset`] is loaded from there. 
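From there you can, for example, set the format to PyTorch and iterate over the dataset with a data loader. This is just a rough sketch that reuses the small `id`/`name` dataset from above:

```py
>>> from torch.utils.data import DataLoader
>>> ds = ds.with_format("torch")
>>> dataloader = DataLoader(ds, batch_size=2)
>>> for batch in dataloader:
...     ...
```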
24 | 25 | Alternatively, you can skip materialization by using [`IterableDataset.from_spark`], which returns an [`IterableDataset`]: 26 | 27 | ```py 28 | >>> from datasets import IterableDataset 29 | >>> df = spark.createDataFrame( 30 | ... data=[[1, "Elia"], [2, "Teo"], [3, "Fang"]], 31 | ... columns=["id", "name"], 32 | ... ) 33 | >>> ds = IterableDataset.from_spark(df) 34 | >>> print(next(iter(ds))) 35 | {"id": 1, "name": "Elia"} 36 | ``` 37 | 38 | ### Caching 39 | 40 | When using [`Dataset.from_spark`], the resulting [`Dataset`] is cached; if you call [`Dataset.from_spark`] multiple 41 | times on the same DataFrame it won't re-run the Spark job that writes the dataset as Arrow files on disk. 42 | 43 | You can set the cache location by passing `cache_dir=` to [`Dataset.from_spark`]. 44 | Make sure to use a disk that is available to both your workers and your current machine (the driver). 45 | 46 | > [!WARNING] 47 | > In a different session, a Spark DataFrame doesn't have the same [semantic hash](https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.DataFrame.semanticHash.html), and it will rerun a Spark job and store it in a new cache. 48 | 49 | ### Feature types 50 | 51 | If your dataset is made of images, audio data or N-dimensional arrays, you can specify the `features=` argument in 52 | [`Dataset.from_spark`] (or [`IterableDataset.from_spark`]): 53 | 54 | ```py 55 | >>> from datasets import Dataset, Features, Image, Value 56 | >>> data = [(0, open("image.png", "rb").read())] 57 | >>> df = spark.createDataFrame(data, "idx: int, image: binary") 58 | >>> # Also works if you have arrays 59 | >>> # data = [(0, np.zeros(shape=(32, 32, 3), dtype=np.int32).tolist())] 60 | >>> # df = spark.createDataFrame(data, "idx: int, image: array>>") 61 | >>> features = Features({"idx": Value("int64"), "image": Image()}) 62 | >>> dataset = Dataset.from_spark(df, features=features) 63 | >>> dataset[0] 64 | {'idx': 0, 'image': } 65 | ``` 66 | 67 | You can check the [`Features`] documentation to know about all the feature types available. 
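Along the same lines, here is a sketch for N-dimensional arrays using the `Array2D` feature type (the column name, shape and Spark schema below are made up for illustration):

```py
>>> import numpy as np
>>> from datasets import Dataset, Features, Array2D, Value
>>> data = [(0, np.zeros(shape=(32, 32), dtype=np.int32).tolist())]
>>> df = spark.createDataFrame(data, "idx: int, array: array<array<int>>")
>>> features = Features({"idx": Value("int64"), "array": Array2D(shape=(32, 32), dtype="int32")})
>>> dataset = Dataset.from_spark(df, features=features)
```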
68 | -------------------------------------------------------------------------------- /tests/commands/test_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import namedtuple 3 | 4 | import pytest 5 | 6 | from datasets import ClassLabel, Features, List, Value 7 | from datasets.commands.test import TestCommand 8 | from datasets.info import DatasetInfo, DatasetInfosDict 9 | 10 | 11 | _TestCommandArgs = namedtuple( 12 | "_TestCommandArgs", 13 | [ 14 | "dataset", 15 | "name", 16 | "cache_dir", 17 | "data_dir", 18 | "all_configs", 19 | "save_infos", 20 | "ignore_verifications", 21 | "force_redownload", 22 | "clear_cache", 23 | "num_proc", 24 | ], 25 | defaults=[None, None, None, False, False, False, False, False, None], 26 | ) 27 | 28 | 29 | def is_1percent_close(source, target): 30 | return (abs(source - target) / target) < 0.01 31 | 32 | 33 | @pytest.mark.integration 34 | def test_test_command(dataset_dir): 35 | args = _TestCommandArgs(dataset=dataset_dir, all_configs=True, save_infos=True) 36 | test_command = TestCommand(*args) 37 | test_command.run() 38 | dataset_readme_path = os.path.join(dataset_dir, "README.md") 39 | assert os.path.exists(dataset_readme_path) 40 | dataset_infos = DatasetInfosDict.from_directory(dataset_dir) 41 | expected_dataset_infos = DatasetInfosDict( 42 | { 43 | "default": DatasetInfo( 44 | features=Features( 45 | { 46 | "tokens": List(Value("string")), 47 | "ner_tags": List( 48 | ClassLabel(names=["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]) 49 | ), 50 | "langs": List(Value("string")), 51 | "spans": List(Value("string")), 52 | } 53 | ), 54 | splits=[ 55 | { 56 | "name": "train", 57 | "num_bytes": 2351563, 58 | "num_examples": 10000, 59 | }, 60 | { 61 | "name": "validation", 62 | "num_bytes": 238418, 63 | "num_examples": 1000, 64 | }, 65 | ], 66 | download_size=3940680, 67 | dataset_size=2589981, 68 | ) 69 | } 70 | ) 71 | assert dataset_infos.keys() == expected_dataset_infos.keys() 72 | for key in DatasetInfo._INCLUDED_INFO_IN_YAML: 73 | result, expected = getattr(dataset_infos["default"], key), getattr(expected_dataset_infos["default"], key) 74 | if key == "num_bytes": 75 | assert is_1percent_close(result, expected) 76 | elif key == "splits": 77 | assert list(result) == list(expected) 78 | for split in result: 79 | assert result[split].name == expected[split].name 80 | assert result[split].num_examples == expected[split].num_examples 81 | assert is_1percent_close(result[split].num_bytes, expected[split].num_bytes) 82 | else: 83 | result == expected 84 | -------------------------------------------------------------------------------- /docs/source/package_reference/loading_methods.mdx: -------------------------------------------------------------------------------- 1 | # Loading methods 2 | 3 | Methods for listing and loading datasets: 4 | 5 | ## Datasets 6 | 7 | [[autodoc]] datasets.load_dataset 8 | 9 | [[autodoc]] datasets.load_from_disk 10 | 11 | [[autodoc]] datasets.load_dataset_builder 12 | 13 | [[autodoc]] datasets.get_dataset_config_names 14 | 15 | [[autodoc]] datasets.get_dataset_infos 16 | 17 | [[autodoc]] datasets.get_dataset_split_names 18 | 19 | ## From files 20 | 21 | Configurations used to load data files. 
22 | They are used when loading local files or a dataset repository: 23 | 24 | - local files: `load_dataset("parquet", data_dir="path/to/data/dir")` 25 | - dataset repository: `load_dataset("allenai/c4")` 26 | 27 | You can pass arguments to `load_dataset` to configure data loading. 28 | For example you can specify the `sep` parameter to define the [`~datasets.packaged_modules.csv.CsvConfig`] that is used to load the data: 29 | 30 | ```python 31 | load_dataset("csv", data_dir="path/to/data/dir", sep="\t") 32 | ``` 33 | 34 | ### Text 35 | 36 | [[autodoc]] datasets.packaged_modules.text.TextConfig 37 | 38 | [[autodoc]] datasets.packaged_modules.text.Text 39 | 40 | ### CSV 41 | 42 | [[autodoc]] datasets.packaged_modules.csv.CsvConfig 43 | 44 | [[autodoc]] datasets.packaged_modules.csv.Csv 45 | 46 | ### JSON 47 | 48 | [[autodoc]] datasets.packaged_modules.json.JsonConfig 49 | 50 | [[autodoc]] datasets.packaged_modules.json.Json 51 | 52 | ### XML 53 | 54 | [[autodoc]] datasets.packaged_modules.xml.XmlConfig 55 | 56 | [[autodoc]] datasets.packaged_modules.xml.Xml 57 | 58 | ### Parquet 59 | 60 | [[autodoc]] datasets.packaged_modules.parquet.ParquetConfig 61 | 62 | [[autodoc]] datasets.packaged_modules.parquet.Parquet 63 | 64 | ### Arrow 65 | 66 | [[autodoc]] datasets.packaged_modules.arrow.ArrowConfig 67 | 68 | [[autodoc]] datasets.packaged_modules.arrow.Arrow 69 | 70 | ### SQL 71 | 72 | [[autodoc]] datasets.packaged_modules.sql.SqlConfig 73 | 74 | [[autodoc]] datasets.packaged_modules.sql.Sql 75 | 76 | ### Images 77 | 78 | [[autodoc]] datasets.packaged_modules.imagefolder.ImageFolderConfig 79 | 80 | [[autodoc]] datasets.packaged_modules.imagefolder.ImageFolder 81 | 82 | ### Audio 83 | 84 | [[autodoc]] datasets.packaged_modules.audiofolder.AudioFolderConfig 85 | 86 | [[autodoc]] datasets.packaged_modules.audiofolder.AudioFolder 87 | 88 | ### Videos 89 | 90 | [[autodoc]] datasets.packaged_modules.videofolder.VideoFolderConfig 91 | 92 | [[autodoc]] datasets.packaged_modules.videofolder.VideoFolder 93 | 94 | ### HDF5 95 | 96 | [[autodoc]] datasets.packaged_modules.hdf5.HDF5Config 97 | 98 | [[autodoc]] datasets.packaged_modules.hdf5.HDF5 99 | 100 | ### Pdf 101 | 102 | [[autodoc]] datasets.packaged_modules.pdffolder.PdfFolderConfig 103 | 104 | [[autodoc]] datasets.packaged_modules.pdffolder.PdfFolder 105 | 106 | ### Nifti 107 | 108 | [[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolderConfig 109 | 110 | [[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolder 111 | 112 | ### WebDataset 113 | 114 | [[autodoc]] datasets.packaged_modules.webdataset.WebDataset 115 | -------------------------------------------------------------------------------- /docs/source/filesystems.mdx: -------------------------------------------------------------------------------- 1 | # Cloud storage 2 | 3 | ## Hugging Face Datasets 4 | 5 | The Hugging Face Dataset Hub is home to a growing collection of datasets that span a variety of domains and tasks. 6 | 7 | It's more than a cloud storage: the Dataset Hub is a platform that provides data versioning thanks to git, as well as a Dataset Viewer to explore the data, making it a great place to store AI-ready datasets. 8 | 9 | This guide shows how to import data from other cloud storage using the filesystems implementations from `fsspec`. 10 | 11 | ## Import data from a cloud storage 12 | 13 | Most cloud storage providers have a `fsspec` FileSystem implementation, which is useful to import data from any cloud provider with the same code. 
14 | This is especially useful to publish datasets on Hugging Face. 15 | 16 | Take a look at the following table for some example of supported cloud storage providers: 17 | 18 | | Storage provider | Filesystem implementation | 19 | |----------------------|---------------------------------------------------------------| 20 | | Amazon S3 | [s3fs](https://s3fs.readthedocs.io/en/latest/) | 21 | | Google Cloud Storage | [gcsfs](https://gcsfs.readthedocs.io/en/latest/) | 22 | | Azure Blob/DataLake | [adlfs](https://github.com/fsspec/adlfs) | 23 | | Oracle Cloud Storage | [ocifs](https://ocifs.readthedocs.io/en/latest/) | 24 | 25 | This guide will show you how to import data files from any cloud storage and save a dataset on Hugging Face. 26 | 27 | Let's say we want to publish a dataset on Hugging Face from Parquet files from a cloud storage. 28 | 29 | First, instantiate your cloud storage filesystem and list the files you'd like to import: 30 | 31 | ```python 32 | >>> import fsspec 33 | >>> fs = fsspec.filesystem("...") # s3 / gcs / abfs / adl / oci / ... 34 | >>> data_dir = "path/to/my/data/" 35 | >>> pattern = "*.parquet" 36 | >>> data_files = fs.glob(data_dir + pattern) 37 | ["path/to/my/data/0001.parquet", "path/to/my/data/0001.parquet", ...] 38 | ``` 39 | 40 | Then you can create a dataset on Hugging Face and import the data files, using for example: 41 | 42 | ```python 43 | >>> from huggingface_hub import create_repo, upload_file 44 | >>> from tqdm.auto import tqdm 45 | >>> destination_dataset = "username/my-dataset" 46 | >>> create_repo(destination_dataset, repo_type="dataset") 47 | >>> for data_file in tqdm(fs.glob(data_dir + pattern)): 48 | ... with fs.open(data_file) as fileobj: 49 | ... path_in_repo = data_file[len(data_dir):] 50 | ... upload_file( 51 | ... path_or_fileobj=fileobj, 52 | ... path_in_repo=path_in_repo, 53 | ... repo_id=destination_dataset, 54 | ... repo_type="dataset", 55 | ... ) 56 | ``` 57 | 58 | Check out the [huggingface_hub](https://huggingface.co/docs/huggingface_hub) documentation on files uploads [here](https://huggingface.co/docs/huggingface_hub/en/guides/upload) if you're looking for more upload options. 59 | 60 | Finally you can now load the dataset using 🤗 Datasets: 61 | 62 | ```python 63 | >>> from datasets import load_dataset 64 | >>> ds = load_dataset("username/my-dataset") 65 | ``` 66 | -------------------------------------------------------------------------------- /templates/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | TODO: "Add YAML tags here. 
Delete these instructions and copy-paste the YAML tags obtained with the online tagging app: https://huggingface.co/spaces/huggingface/datasets-tagging" 3 | --- 4 | 5 | # Dataset Card for [Dataset Name] 6 | 7 | ## Table of Contents 8 | - [Table of Contents](#table-of-contents) 9 | - [Dataset Description](#dataset-description) 10 | - [Dataset Summary](#dataset-summary) 11 | - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards) 12 | - [Languages](#languages) 13 | - [Dataset Structure](#dataset-structure) 14 | - [Data Instances](#data-instances) 15 | - [Data Fields](#data-fields) 16 | - [Data Splits](#data-splits) 17 | - [Dataset Creation](#dataset-creation) 18 | - [Curation Rationale](#curation-rationale) 19 | - [Source Data](#source-data) 20 | - [Annotations](#annotations) 21 | - [Personal and Sensitive Information](#personal-and-sensitive-information) 22 | - [Considerations for Using the Data](#considerations-for-using-the-data) 23 | - [Social Impact of Dataset](#social-impact-of-dataset) 24 | - [Discussion of Biases](#discussion-of-biases) 25 | - [Other Known Limitations](#other-known-limitations) 26 | - [Additional Information](#additional-information) 27 | - [Dataset Curators](#dataset-curators) 28 | - [Licensing Information](#licensing-information) 29 | - [Citation Information](#citation-information) 30 | - [Contributions](#contributions) 31 | 32 | ## Dataset Description 33 | 34 | - **Homepage:** 35 | - **Repository:** 36 | - **Paper:** 37 | - **Leaderboard:** 38 | - **Point of Contact:** 39 | 40 | ### Dataset Summary 41 | 42 | [More Information Needed] 43 | 44 | ### Supported Tasks and Leaderboards 45 | 46 | [More Information Needed] 47 | 48 | ### Languages 49 | 50 | [More Information Needed] 51 | 52 | ## Dataset Structure 53 | 54 | ### Data Instances 55 | 56 | [More Information Needed] 57 | 58 | ### Data Fields 59 | 60 | [More Information Needed] 61 | 62 | ### Data Splits 63 | 64 | [More Information Needed] 65 | 66 | ## Dataset Creation 67 | 68 | ### Curation Rationale 69 | 70 | [More Information Needed] 71 | 72 | ### Source Data 73 | 74 | #### Initial Data Collection and Normalization 75 | 76 | [More Information Needed] 77 | 78 | #### Who are the source language producers? 79 | 80 | [More Information Needed] 81 | 82 | ### Annotations 83 | 84 | #### Annotation process 85 | 86 | [More Information Needed] 87 | 88 | #### Who are the annotators? 89 | 90 | [More Information Needed] 91 | 92 | ### Personal and Sensitive Information 93 | 94 | [More Information Needed] 95 | 96 | ## Considerations for Using the Data 97 | 98 | ### Social Impact of Dataset 99 | 100 | [More Information Needed] 101 | 102 | ### Discussion of Biases 103 | 104 | [More Information Needed] 105 | 106 | ### Other Known Limitations 107 | 108 | [More Information Needed] 109 | 110 | ## Additional Information 111 | 112 | ### Dataset Curators 113 | 114 | [More Information Needed] 115 | 116 | ### Licensing Information 117 | 118 | [More Information Needed] 119 | 120 | ### Citation Information 121 | 122 | [More Information Needed] 123 | 124 | ### Contributions 125 | 126 | Thanks to [@github-username](https://github.com/) for adding this dataset. 127 | -------------------------------------------------------------------------------- /src/datasets/naming.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Lint as: python3 16 | """Utilities for file names.""" 17 | 18 | import itertools 19 | import os 20 | import re 21 | 22 | 23 | _uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])") 24 | _lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])") 25 | 26 | _single_underscore_re = re.compile(r"(?:/\|?*" 32 | 33 | 34 | def camelcase_to_snakecase(name): 35 | """Convert camel-case string to snake-case.""" 36 | name = _uppercase_uppercase_re.sub(r"\1_\2", name) 37 | name = _lowercase_uppercase_re.sub(r"\1_\2", name) 38 | return name.lower() 39 | 40 | 41 | def snakecase_to_camelcase(name): 42 | """Convert snake-case string to camel-case string.""" 43 | name = _single_underscore_re.split(name) 44 | name = [_multiple_underscores_re.split(n) for n in name] 45 | return "".join(n.capitalize() for n in itertools.chain.from_iterable(name) if n != "") 46 | 47 | 48 | def filename_prefix_for_name(name): 49 | if os.path.basename(name) != name: 50 | raise ValueError(f"Should be a dataset name, not a path: {name}") 51 | return camelcase_to_snakecase(name) 52 | 53 | 54 | def filename_prefix_for_split(name, split): 55 | if os.path.basename(name) != name: 56 | raise ValueError(f"Should be a dataset name, not a path: {name}") 57 | if not re.match(_split_re, split): 58 | raise ValueError(f"Split name should match '{_split_re}'' but got '{split}'.") 59 | return f"{filename_prefix_for_name(name)}-{split}" 60 | 61 | 62 | def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None): 63 | prefix = filename_prefix_for_split(dataset_name, split) 64 | if filetype_suffix: 65 | prefix += f".{filetype_suffix}" 66 | filepath = os.path.join(data_dir, prefix) 67 | return f"{filepath}*" 68 | 69 | 70 | def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None): 71 | prefix = filename_prefix_for_split(dataset_name, split) 72 | prefix = os.path.join(path, prefix) 73 | 74 | if shard_lengths and len(shard_lengths) > 1: 75 | num_shards = len(shard_lengths) 76 | filenames = [f"{prefix}-{shard_id:05d}-of-{num_shards:05d}" for shard_id in range(num_shards)] 77 | if filetype_suffix: 78 | filenames = [filename + f".{filetype_suffix}" for filename in filenames] 79 | return filenames 80 | else: 81 | filename = prefix 82 | if filetype_suffix: 83 | filename += f".{filetype_suffix}" 84 | return [filename] 85 | -------------------------------------------------------------------------------- /docs/source/nlp_process.mdx: -------------------------------------------------------------------------------- 1 | # Process text data 2 | 3 | This guide shows specific methods for processing text datasets. Learn how to: 4 | 5 | - Tokenize a dataset with [`~Dataset.map`]. 6 | - Align dataset labels with label ids for NLI datasets. 7 | 8 | For a guide on how to process any type of dataset, take a look at the general process guide. 
9 | 10 | ## Map 11 | 12 | The [`~Dataset.map`] function supports processing batches of examples at once which speeds up tokenization. 13 | 14 | Load a tokenizer from 🤗 [Transformers](https://huggingface.co/transformers/): 15 | 16 | ```py 17 | >>> from transformers import AutoTokenizer 18 | 19 | >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") 20 | ``` 21 | 22 | Set the `batched` parameter to `True` in the [`~Dataset.map`] function to apply the tokenizer to batches of examples: 23 | 24 | ```py 25 | >>> dataset = dataset.map(lambda examples: tokenizer(examples["text"]), batched=True) 26 | >>> dataset[0] 27 | {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 28 | 'label': 1, 29 | 'input_ids': [101, 1996, 2600, 2003, 16036, 2000, 2022, 1996, 7398, 2301, 1005, 1055, 2047, 1000, 16608, 1000, 1998, 2008, 2002, 1005, 1055, 2183, 2000, 2191, 1037, 17624, 2130, 3618, 2084, 7779, 29058, 8625, 13327, 1010, 3744, 1011, 18856, 19513, 3158, 5477, 4168, 2030, 7112, 16562, 2140, 1012, 102], 30 | 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 31 | 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 32 | ``` 33 | 34 | The [`~Dataset.map`] function converts the returned values to a PyArrow-supported format. But explicitly returning the tensors as NumPy arrays is faster because it is a natively supported PyArrow format. Set `return_tensors="np"` when you tokenize your text: 35 | 36 | ```py 37 | >>> dataset = dataset.map(lambda examples: tokenizer(examples["text"], return_tensors="np"), batched=True) 38 | ``` 39 | 40 | ## Align 41 | 42 | The [`~Dataset.align_labels_with_mapping`] function aligns a dataset label id with the label name. Not all 🤗 Transformers models follow the prescribed label mapping of the original dataset, especially for NLI datasets. For example, the [MNLI](https://huggingface.co/datasets/glue) dataset uses the following label mapping: 43 | 44 | ```py 45 | >>> label2id = {"entailment": 0, "neutral": 1, "contradiction": 2} 46 | ``` 47 | 48 | To align the dataset label mapping with the mapping used by a model, create a dictionary of the label name and id to align on: 49 | 50 | ```py 51 | >>> label2id = {"contradiction": 0, "neutral": 1, "entailment": 2} 52 | ``` 53 | 54 | Pass the dictionary of the label mappings to the [`~Dataset.align_labels_with_mapping`] function, and the column to align on: 55 | 56 | ```py 57 | >>> from datasets import load_dataset 58 | 59 | >>> mnli = load_dataset("nyu-mll/glue", "mnli", split="train") 60 | >>> mnli_aligned = mnli.align_labels_with_mapping(label2id, "label") 61 | ``` 62 | 63 | You can also use this function to assign a custom mapping of labels to ids. 
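If you want to confirm the alignment did what you expect, one option (a small sketch that assumes the `mnli` and `mnli_aligned` objects from the snippet above) is to compare the `ClassLabel` names, whose order follows the label ids:

```py
>>> mnli.features["label"].names           # original MNLI order
['entailment', 'neutral', 'contradiction']
>>> mnli_aligned.features["label"].names   # order defined by the label2id mapping above
['contradiction', 'neutral', 'entailment']
```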
-------------------------------------------------------------------------------- /tests/test_hub.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | from types import SimpleNamespace 3 | from unittest.mock import patch 4 | from urllib.parse import quote 5 | 6 | import pytest 7 | from huggingface_hub import CommitOperationAdd, CommitOperationDelete 8 | 9 | import datasets 10 | from datasets.config import METADATA_CONFIGS_FIELD 11 | from datasets.hub import delete_from_hub 12 | from datasets.utils.hub import hf_dataset_url 13 | 14 | 15 | @pytest.mark.parametrize("repo_id", ["canonical_dataset_name", "org-name/dataset-name"]) 16 | @pytest.mark.parametrize("filename", ["filename.csv", "filename with blanks.csv"]) 17 | @pytest.mark.parametrize("revision", [None, "v2"]) 18 | def test_dataset_url(repo_id, filename, revision): 19 | url = hf_dataset_url(repo_id=repo_id, filename=filename, revision=revision) 20 | assert url == f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{quote(filename)}" 21 | 22 | 23 | def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_config) -> None: 24 | with temporary_repo() as repo_id: 25 | hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset") 26 | hf_api.upload_file( 27 | path_or_fileobj=str(csv_path), 28 | path_in_repo="cats/train/0000.csv", 29 | repo_id=repo_id, 30 | repo_type="dataset", 31 | token=hf_token, 32 | ) 33 | hf_api.upload_file( 34 | path_or_fileobj=str(csv_path), 35 | path_in_repo="dogs/train/0000.csv", 36 | repo_id=repo_id, 37 | repo_type="dataset", 38 | token=hf_token, 39 | ) 40 | hf_api.upload_file( 41 | token=hf_token, 42 | path_or_fileobj=dedent( 43 | f"""\ 44 | --- 45 | {METADATA_CONFIGS_FIELD}: 46 | - config_name: cats 47 | data_files: 48 | - split: train 49 | path: cats/train/* 50 | - config_name: dogs 51 | data_files: 52 | - split: train 53 | path: dogs/train/* 54 | --- 55 | """ 56 | ).encode(), 57 | path_in_repo="README.md", 58 | repo_id=repo_id, 59 | repo_type="dataset", 60 | ) 61 | commit_info = SimpleNamespace( 62 | pr_url="https:///hub-ci.huggingface.co/datasets/__DUMMY_USER__/__DUMMY_DATASET__/refs%2Fpr%2F1" 63 | ) 64 | with patch.object(datasets.hub.HfApi, "create_commit", return_value=commit_info) as mock_method: 65 | _ = delete_from_hub(repo_id, "dogs") 66 | assert mock_method.called 67 | assert mock_method.call_args.kwargs.get("commit_message") == "Delete 'dogs' config" 68 | assert mock_method.call_args.kwargs.get("create_pr") 69 | expected_operations = [ 70 | CommitOperationDelete(path_in_repo="dogs/train/0000.csv", is_folder=False), 71 | CommitOperationAdd( 72 | path_in_repo="README.md", 73 | path_or_fileobj=dedent( 74 | f"""\ 75 | --- 76 | {METADATA_CONFIGS_FIELD}: 77 | - config_name: cats 78 | data_files: 79 | - split: train 80 | path: cats/train/* 81 | --- 82 | """ 83 | ).encode(), 84 | ), 85 | ] 86 | assert mock_method.call_args.kwargs.get("operations") == expected_operations 87 | -------------------------------------------------------------------------------- /docs/source/image_classification.mdx: -------------------------------------------------------------------------------- 1 | # Image classification 2 | 3 | Image classification datasets are used to train a model to classify an entire image. There are a wide variety of applications enabled by these datasets such as identifying endangered wildlife species or screening for disease in medical images. 
This guide will show you how to apply transformations to an image classification dataset. 4 | 5 | Before you start, make sure you have up-to-date versions of `albumentations` and `cv2` installed: 6 | 7 | ```bash 8 | pip install -U albumentations opencv-python 9 | ``` 10 | 11 | This guide uses the [Beans](https://huggingface.co/datasets/beans) dataset for identifying the type of bean plant disease based on an image of its leaf. 12 | 13 | Load the dataset and take a look at an example: 14 | 15 | ```py 16 | >>> from datasets import load_dataset 17 | 18 | >>> dataset = load_dataset("AI-Lab-Makerere/beans") 19 | >>> dataset["train"][10] 20 | {'image': , 21 | 'image_file_path': '/root/.cache/huggingface/datasets/downloads/extracted/b0a21163f78769a2cf11f58dfc767fb458fc7cea5c05dccc0144a2c0f0bc1292/train/angular_leaf_spot/angular_leaf_spot_train.204.jpg', 22 | 'labels': 0} 23 | ``` 24 | 25 | The dataset has three fields: 26 | 27 | * `image`: a PIL image object. 28 | * `image_file_path`: the path to the image file. 29 | * `labels`: the label or category of the image. 30 | 31 | Next, check out an image: 32 | 33 |
34 | [image: an example leaf image from the Beans dataset] 35 |
36 | 37 | Now apply some augmentations with `albumentations`. You'll randomly crop the image, flip it horizontally, and adjust its brightness. 38 | 39 | ```py 40 | >>> import cv2 41 | >>> import albumentations 42 | >>> import numpy as np 43 | 44 | >>> transform = albumentations.Compose([ 45 | ... albumentations.RandomCrop(width=256, height=256), 46 | ... albumentations.HorizontalFlip(p=0.5), 47 | ... albumentations.RandomBrightnessContrast(p=0.2), 48 | ... ]) 49 | ``` 50 | 51 | Create a function to apply the transformation to the images: 52 | 53 | ```py 54 | >>> def transforms(examples): 55 | ... examples["pixel_values"] = [ 56 | ... transform(image=np.array(image))["image"] for image in examples["image"] 57 | ... ] 58 | ... 59 | ... return examples 60 | ``` 61 | 62 | Use the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset to consume less disk space: 63 | 64 | ```py 65 | >>> dataset.set_transform(transforms) 66 | ``` 67 | 68 | You can verify the transformation worked by indexing into the `pixel_values` of the first example: 69 | 70 | ```py 71 | >>> import numpy as np 72 | >>> import matplotlib.pyplot as plt 73 | 74 | >>> img = dataset["train"][0]["pixel_values"] 75 | >>> plt.imshow(img) 76 | ``` 77 | 78 |
79 | [image: the leaf image after the random crop, horizontal flip, and brightness augmentations] 80 | 81 |
82 | 83 | > [!TIP] 84 | > Now that you know how to process a dataset for image classification, learn 85 | > [how to train an image classification model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb) 86 | > and use it for inference. -------------------------------------------------------------------------------- /docs/source/about_map_batch.mdx: -------------------------------------------------------------------------------- 1 | # Batch mapping 2 | 3 | Combining the utility of [`Dataset.map`] with batch mode is very powerful. It allows you to speed up processing, and freely control the size of the generated dataset. 4 | 5 | ## Need for speed 6 | 7 | The primary objective of batch mapping is to speed up processing. Often times, it is faster to work with batches of data instead of single examples. Naturally, batch mapping lends itself to tokenization. For example, the 🤗 [Tokenizers](https://huggingface.co/docs/tokenizers/python/latest/) library works faster with batches because it parallelizes the tokenization of all the examples in a batch. 8 | 9 | ## Input size != output size 10 | 11 | The ability to control the size of the generated dataset can be leveraged for many interesting use-cases. In the How-to [map](#map) section, there are examples of using batch mapping to: 12 | 13 | - Split long sentences into shorter chunks. 14 | - Augment a dataset with additional tokens. 15 | 16 | It is helpful to understand how this works, so you can come up with your own ways to use batch mapping. At this point, you may be wondering how you can control the size of the generated dataset. The answer is: **the mapped function does not have to return an output batch of the same size**. 17 | 18 | In other words, your mapped function input can be a batch of size `N` and return a batch of size `M`. The output `M` can be greater than or less than `N`. This means you can concatenate your examples, divide it up, and even add more examples! 19 | 20 | However, remember that all values in the output dictionary must contain the **same number of elements** as the other fields in the output dictionary. Otherwise, it is not possible to define the number of examples in the output returned by the mapped function. The number can vary between successive batches processed by the mapped function. For a single batch though, all values of the output dictionary should have the same length (i.e., the number of elements). 21 | 22 | For example, from a dataset of 1 column and 3 rows, if you use `map` to return a new column with twice as many rows, then you will have an error. 23 | In this case, you end up with one column with 3 rows, and one column with 6 rows. As you can see, the table will not be valid: 24 | 25 | ```py 26 | >>> from datasets import Dataset 27 | >>> dataset = Dataset.from_dict({"a": [0, 1, 2]}) 28 | >>> dataset.map(lambda batch: {"b": batch["a"] * 2}, batched=True) # new column with 6 elements: [0, 1, 2, 0, 1, 2] 29 | 'ArrowInvalid: Column 1 named b expected length 3 but got length 6' 30 | ``` 31 | 32 | To make it valid, you have to drop one of the columns: 33 | 34 | ```py 35 | >>> from datasets import Dataset 36 | >>> dataset = Dataset.from_dict({"a": [0, 1, 2]}) 37 | >>> dataset_with_duplicates = dataset.map(lambda batch: {"b": batch["a"] * 2}, remove_columns=["a"], batched=True) 38 | >>> len(dataset_with_duplicates) 39 | 6 40 | ``` 41 | Alternatively, you can overwrite the existing column to achieve the same result. 
42 | For example, here’s how to duplicate every row in the dataset by overwriting column `"a"`: 43 | 44 | ```py 45 | >>> from datasets import Dataset 46 | >>> dataset = Dataset.from_dict({"a": [0, 1, 2]}) 47 | # overwrites the existing "a" column with duplicated values 48 | >>> duplicated_dataset = dataset.map( 49 | ... lambda batch: {"a": [x for x in batch["a"] for _ in range(2)]}, 50 | ... batched=True 51 | ... ) 52 | >>> duplicated_dataset 53 | Dataset({ 54 | features: ['a'], 55 | num_rows: 6 56 | }) 57 | >>> duplicated_dataset["a"] 58 | [0, 0, 1, 1, 2, 2] 59 | ``` 60 | -------------------------------------------------------------------------------- /docs/source/about_cache.mdx: -------------------------------------------------------------------------------- 1 | # The cache 2 | 3 | The cache is one of the reasons why 🤗 Datasets is so efficient. It stores previously downloaded and processed datasets so when you need to use them again, they are reloaded directly from the cache. This avoids having to download a dataset all over again, or reapplying processing functions. Even after you close and start another Python session, 🤗 Datasets will reload your dataset directly from the cache! 4 | 5 | ## Fingerprint 6 | 7 | How does the cache keeps track of what transforms are applied to a dataset? Well, 🤗 Datasets assigns a fingerprint to the cache file. A fingerprint keeps track of the current state of a dataset. The initial fingerprint is computed using a hash from the Arrow table, or a hash of the Arrow files if the dataset is on disk. Subsequent fingerprints are computed by combining the fingerprint of the previous state, and a hash of the latest transform applied. 8 | 9 | > [!TIP] 10 | > Transforms are any of the processing methods from the [How-to Process](./process) guides such as [`Dataset.map`] or [`Dataset.shuffle`]. 11 | 12 | Here are what the actual fingerprints look like: 13 | 14 | ```py 15 | >>> from datasets import Dataset 16 | >>> dataset1 = Dataset.from_dict({"a": [0, 1, 2]}) 17 | >>> dataset2 = dataset1.map(lambda x: {"a": x["a"] + 1}) 18 | >>> print(dataset1._fingerprint, dataset2._fingerprint) 19 | d19493523d95e2dc 5b86abacd4b42434 20 | ``` 21 | 22 | In order for a transform to be hashable, it needs to be picklable by [dill](https://dill.readthedocs.io/en/latest/) or [pickle](https://docs.python.org/3/library/pickle). 23 | 24 | When you use a non-hashable transform, 🤗 Datasets uses a random fingerprint instead and raises a warning. The non-hashable transform is considered different from the previous transforms. As a result, 🤗 Datasets will recompute all the transforms. Make sure your transforms are serializable with pickle or dill to avoid this! 25 | 26 | An example of when 🤗 Datasets recomputes everything is when caching is disabled. When this happens, the cache files are generated every time and they get written to a temporary directory. Once your Python session ends, the cache files in the temporary directory are deleted. A random hash is assigned to these cache files, instead of a fingerprint. 27 | 28 | > [!TIP] 29 | > When caching is disabled, use [`Dataset.save_to_disk`] to save your transformed dataset or it will be deleted once the session ends. 30 | 31 | ## Hashing 32 | 33 | The fingerprint of a dataset is updated by hashing the function passed to `map` as well as the `map` parameters (`batch_size`, `remove_columns`, etc.). 
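To see this in action, here is a small illustrative check (a sketch, not part of the original guide): mapping the same function with different parameters yields different fingerprints, so the two results are cached separately:

```py
>>> from datasets import Dataset
>>> ds = Dataset.from_dict({"a": list(range(10))})
>>> add_one = lambda batch: {"b": [x + 1 for x in batch["a"]]}
>>> ds1 = ds.map(add_one, batched=True, batch_size=2)
>>> ds2 = ds.map(add_one, batched=True, batch_size=5)  # same function, different `batch_size`
>>> ds1._fingerprint == ds2._fingerprint
False
```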
34 | 35 | You can check the hash of any Python object using the [`fingerprint.Hasher`]: 36 | 37 | ```py 38 | >>> from datasets.fingerprint import Hasher 39 | >>> my_func = lambda example: {"length": len(example["text"])} 40 | >>> print(Hasher.hash(my_func)) 41 | '3d35e2b3e94c81d6' 42 | ``` 43 | 44 | The hash is computed by dumping the object using a `dill` pickler and hashing the dumped bytes. 45 | The pickler recursively dumps all the variables used in your function, so any change you do to an object that is used in your function, will cause the hash to change. 46 | 47 | If one of your functions doesn't seem to have the same hash across sessions, it means at least one of its variables contains a Python object that is not deterministic. 48 | When this happens, feel free to hash any object you find suspicious to try to find the object that caused the hash to change. 49 | For example, if you use a list for which the order of its elements is not deterministic across sessions, then the hash won't be the same across sessions either. 50 | -------------------------------------------------------------------------------- /docs/source/image_process.mdx: -------------------------------------------------------------------------------- 1 | # Process image data 2 | 3 | This guide shows specific methods for processing image datasets. Learn how to: 4 | 5 | - Use [`~Dataset.map`] with image dataset. 6 | - Apply data augmentations to a dataset with [`~Dataset.set_transform`]. 7 | 8 | For a guide on how to process any type of dataset, take a look at the general process guide. 9 | 10 | ## Map 11 | 12 | The [`~Dataset.map`] function can apply transforms over an entire dataset. 13 | 14 | For example, create a basic [`Resize`](https://pytorch.org/vision/stable/generated/torchvision.transforms.Resize.html) function: 15 | 16 | ```py 17 | >>> def transforms(examples): 18 | ... examples["pixel_values"] = [image.convert("RGB").resize((100,100)) for image in examples["image"]] 19 | ... return examples 20 | ``` 21 | 22 | Now use the [`~Dataset.map`] function to resize the entire dataset, and set `batched=True` to speed up the process by accepting batches of examples. The transform returns `pixel_values` as a cacheable `PIL.Image` object: 23 | 24 | ```py 25 | >>> dataset = dataset.map(transforms, remove_columns=["image"], batched=True) 26 | >>> dataset[0] 27 | {'label': 6, 28 | 'pixel_values': } 29 | ``` 30 | 31 | The cache file saves time because you don't have to execute the same transform twice. The [`~Dataset.map`] function is best for operations you only run once per training - like resizing an image - instead of using it for operations executed for each epoch, like data augmentations. 32 | 33 | [`~Dataset.map`] takes up some memory, but you can reduce its memory requirements with the following parameters: 34 | 35 | - [`batch_size`](./package_reference/main_classes#datasets.DatasetDict.map.batch_size) determines the number of examples that are processed in one call to the transform function. 36 | - [`writer_batch_size`](./package_reference/main_classes#datasets.DatasetDict.map.writer_batch_size) determines the number of processed examples that are kept in memory before they are stored away. 37 | 38 | Both parameter values default to 1000, which can be expensive if you are storing images. Lower these values to use less memory when you use [`~Dataset.map`]. 39 | 40 | ## Apply transforms 41 | 42 | 🤗 Datasets applies data augmentations from any library or package to your dataset. 
Transforms can be applied on-the-fly on batches of data with [`~Dataset.set_transform`], which consumes less disk space. 43 | 44 | > [!TIP] 45 | > The following example uses [torchvision](https://pytorch.org/vision/stable/index.html), but feel free to use other data augmentation libraries like [Albumentations](https://albumentations.ai/docs/), [Kornia](https://kornia.readthedocs.io/en/latest/), and [imgaug](https://imgaug.readthedocs.io/en/latest/). 46 | 47 | For example, if you'd like to change the color properties of an image randomly: 48 | 49 | ```py 50 | >>> from torchvision.transforms import Compose, ColorJitter, ToTensor 51 | 52 | >>> jitter = Compose( 53 | ... [ 54 | ... ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.7), 55 | ... ToTensor(), 56 | ... ] 57 | ... ) 58 | ``` 59 | 60 | Create a function to apply the `ColorJitter` transform: 61 | 62 | ```py 63 | >>> def transforms(examples): 64 | ... examples["pixel_values"] = [jitter(image.convert("RGB")) for image in examples["image"]] 65 | ... return examples 66 | ``` 67 | 68 | Apply the transform with the [`~Dataset.set_transform`] function: 69 | 70 | ```py 71 | >>> dataset.set_transform(transforms) 72 | ``` -------------------------------------------------------------------------------- /docs/source/audio_process.mdx: -------------------------------------------------------------------------------- 1 | # Process audio data 2 | 3 | This guide shows specific methods for processing audio datasets. Learn how to: 4 | 5 | - Resample the sampling rate. 6 | - Use [`~Dataset.map`] with audio datasets. 7 | 8 | For a guide on how to process any type of dataset, take a look at the general process guide. 9 | 10 | ## Cast 11 | 12 | The [`~Dataset.cast_column`] function is used to cast a column to another feature to be decoded. When you use this function with the [`Audio`] feature, you can resample the sampling rate: 13 | 14 | ```py 15 | >>> from datasets import load_dataset, Audio 16 | 17 | >>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") 18 | >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) 19 | ``` 20 | 21 | Audio files are decoded and resampled on-the-fly, so the next time you access an example, the audio file is resampled to 16kHz: 22 | 23 | ```py 24 | >>> audio = dataset[0]["audio"] 25 | 26 | >>> audio = audio_dataset[0]["audio"] 27 | >>> samples = audio.get_all_samples() 28 | >>> samples.data 29 | tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3447e-06, 30 | -1.9127e-04, -5.3330e-05]] 31 | >>> samples.sample_rate 32 | 16000 33 | ``` 34 | 35 |
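As an extra check (a brief sketch assuming the `dataset` from the snippet above), the new rate is also visible on the dataset's features after the cast:

```py
>>> dataset.features["audio"]  # the Audio feature now carries the new sampling rate
Audio(sampling_rate=16000, ...)
```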
36 | [audio player: audio sample before resampling] 40 | [audio player: audio sample resampled to 16kHz] 44 |
45 | 46 | ## Map 47 | 48 | The [`~Dataset.map`] function helps preprocess your entire dataset at once. Depending on the type of model you're working with, you'll need to either load a [feature extractor](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoFeatureExtractor) or a [processor](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoProcessor). 49 | 50 | - For pretrained speech recognition models, load a feature extractor and tokenizer and combine them in a `processor`: 51 | 52 | ```py 53 | >>> from transformers import AutoTokenizer, AutoFeatureExtractor, AutoProcessor 54 | 55 | >>> model_checkpoint = "facebook/wav2vec2-large-xlsr-53" 56 | # after defining a vocab.json file you can instantiate a tokenizer object: 57 | >>> tokenizer = AutoTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") 58 | >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint) 59 | >>> processor = AutoProcessor.from_pretrained(feature_extractor=feature_extractor, tokenizer=tokenizer) 60 | ``` 61 | 62 | - For fine-tuned speech recognition models, you only need to load a `processor`: 63 | 64 | ```py 65 | >>> from transformers import AutoProcessor 66 | 67 | >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") 68 | ``` 69 | 70 | When you use [`~Dataset.map`] with your preprocessing function, include the `audio` column to ensure you're actually resampling the audio data: 71 | 72 | ```py 73 | >>> def prepare_dataset(batch): 74 | ... audio = batch["audio"] 75 | ... batch["input_values"] = processor(audio.get_all_samples().data, sampling_rate=audio["sampling_rate"]).input_values[0] 76 | ... batch["input_length"] = len(batch["input_values"]) 77 | ... with processor.as_target_processor(): 78 | ... batch["labels"] = processor(batch["sentence"]).input_ids 79 | ... return batch 80 | >>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names) 81 | ``` 82 | -------------------------------------------------------------------------------- /src/datasets/utils/version.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Lint as: python3 16 | """Version utils.""" 17 | 18 | import dataclasses 19 | import re 20 | from dataclasses import dataclass 21 | from functools import total_ordering 22 | from typing import Optional, Union 23 | 24 | 25 | _VERSION_REG = re.compile(r"^(?P<major>\d+)" r"\.(?P<minor>\d+)" r"\.(?P<patch>\d+)$") 26 | 27 | 28 | @total_ordering 29 | @dataclass 30 | class Version: 31 | """Dataset version `MAJOR.MINOR.PATCH`. 32 | 33 | Args: 34 | version_str (`str`): 35 | The dataset version. 36 | description (`str`): 37 | A description of what is new in this version.
38 | major (`str`): 39 | minor (`str`): 40 | patch (`str`): 41 | 42 | Example: 43 | 44 | ```py 45 | >>> VERSION = datasets.Version("1.0.0") 46 | ``` 47 | """ 48 | 49 | version_str: str 50 | description: Optional[str] = None 51 | major: Optional[Union[str, int]] = None 52 | minor: Optional[Union[str, int]] = None 53 | patch: Optional[Union[str, int]] = None 54 | 55 | def __post_init__(self): 56 | self.major, self.minor, self.patch = _str_to_version_tuple(self.version_str) 57 | 58 | def __repr__(self): 59 | return f"{self.tuple[0]}.{self.tuple[1]}.{self.tuple[2]}" 60 | 61 | @property 62 | def tuple(self): 63 | return self.major, self.minor, self.patch 64 | 65 | def _validate_operand(self, other): 66 | if isinstance(other, str): 67 | return Version(other) 68 | elif isinstance(other, Version): 69 | return other 70 | raise TypeError(f"{other} (type {type(other)}) cannot be compared to version.") 71 | 72 | def __eq__(self, other): 73 | try: 74 | other = self._validate_operand(other) 75 | except (TypeError, ValueError): 76 | return False 77 | else: 78 | return self.tuple == other.tuple 79 | 80 | def __lt__(self, other): 81 | other = self._validate_operand(other) 82 | return self.tuple < other.tuple 83 | 84 | def __hash__(self): 85 | return hash(_version_tuple_to_str(self.tuple)) 86 | 87 | @classmethod 88 | def from_dict(cls, dic): 89 | field_names = {f.name for f in dataclasses.fields(cls)} 90 | return cls(**{k: v for k, v in dic.items() if k in field_names}) 91 | 92 | def _to_yaml_string(self) -> str: 93 | return self.version_str 94 | 95 | 96 | def _str_to_version_tuple(version_str): 97 | """Return the tuple (major, minor, patch) version extracted from the str.""" 98 | res = _VERSION_REG.match(version_str) 99 | if not res: 100 | raise ValueError(f"Invalid version '{version_str}'. 
Format should be x.y.z with {{x,y,z}} being digits.") 101 | return tuple(int(v) for v in [res.group("major"), res.group("minor"), res.group("patch")]) 102 | 103 | 104 | def _version_tuple_to_str(version_tuple): 105 | """Return the str version from the version tuple (major, minor, patch).""" 106 | return ".".join(str(v) for v in version_tuple) 107 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/arrow/arrow.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from dataclasses import dataclass 3 | from typing import Optional 4 | 5 | import pyarrow as pa 6 | 7 | import datasets 8 | from datasets.builder import Key 9 | from datasets.table import table_cast 10 | 11 | 12 | logger = datasets.utils.logging.get_logger(__name__) 13 | 14 | 15 | @dataclass 16 | class ArrowConfig(datasets.BuilderConfig): 17 | """BuilderConfig for Arrow.""" 18 | 19 | features: Optional[datasets.Features] = None 20 | 21 | def __post_init__(self): 22 | super().__post_init__() 23 | 24 | 25 | class Arrow(datasets.ArrowBasedBuilder): 26 | BUILDER_CONFIG_CLASS = ArrowConfig 27 | 28 | def _info(self): 29 | return datasets.DatasetInfo(features=self.config.features) 30 | 31 | def _split_generators(self, dl_manager): 32 | """We handle string, list and dicts in datafiles""" 33 | if not self.config.data_files: 34 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 35 | dl_manager.download_config.extract_on_the_fly = True 36 | data_files = dl_manager.download_and_extract(self.config.data_files) 37 | splits = [] 38 | for split_name, files in data_files.items(): 39 | if isinstance(files, str): 40 | files = [files] 41 | # Use `dl_manager.iter_files` to skip hidden files in an extracted archive 42 | files = [dl_manager.iter_files(file) for file in files] 43 | # Infer features if they are stored in the arrow schema 44 | if self.info.features is None: 45 | for file in itertools.chain.from_iterable(files): 46 | with open(file, "rb") as f: 47 | try: 48 | reader = pa.ipc.open_stream(f) 49 | except (OSError, pa.lib.ArrowInvalid): 50 | reader = pa.ipc.open_file(f) 51 | self.info.features = datasets.Features.from_arrow_schema(reader.schema) 52 | break 53 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) 54 | return splits 55 | 56 | def _cast_table(self, pa_table: pa.Table) -> pa.Table: 57 | if self.info.features is not None: 58 | # more expensive cast to support nested features with keys in a different order 59 | # allows str <-> int/float or str to Audio for example 60 | pa_table = table_cast(pa_table, self.info.features.arrow_schema) 61 | return pa_table 62 | 63 | def _generate_tables(self, files): 64 | for file_idx, file in enumerate(itertools.chain.from_iterable(files)): 65 | with open(file, "rb") as f: 66 | try: 67 | try: 68 | batches = pa.ipc.open_stream(f) 69 | except (OSError, pa.lib.ArrowInvalid): 70 | reader = pa.ipc.open_file(f) 71 | batches = (reader.get_batch(i) for i in range(reader.num_record_batches)) 72 | for batch_idx, record_batch in enumerate(batches): 73 | pa_table = pa.Table.from_batches([record_batch]) 74 | # Uncomment for debugging (will print the Arrow table size and elements) 75 | # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}") 76 | # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows))) 77 | yield Key(file_idx, batch_idx), 
self._cast_table(pa_table) 78 | except ValueError as e: 79 | logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") 80 | raise 81 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "license": "Apache-2.0", 3 | "creators": [ 4 | { 5 | "affiliation": "Hugging Face", 6 | "name": "Quentin Lhoest" 7 | }, 8 | { 9 | "orcid": "0000-0003-1727-1045", 10 | "affiliation": "Hugging Face", 11 | "name": "Albert Villanova del Moral" 12 | }, 13 | { 14 | "affiliation": "Hugging Face", 15 | "name": "Patrick von Platen" 16 | }, 17 | { 18 | "affiliation": "Hugging Face", 19 | "name": "Thomas Wolf" 20 | }, 21 | { 22 | "affiliation": "Hugging Face", 23 | "name": "Mario Šaško" 24 | }, 25 | { 26 | "affiliation": "Hugging Face", 27 | "name": "Yacine Jernite" 28 | }, 29 | { 30 | "affiliation": "Hugging Face", 31 | "name": "Abhishek Thakur" 32 | }, 33 | { 34 | "affiliation": "Hugging Face", 35 | "name": "Lewis Tunstall" 36 | }, 37 | { 38 | "affiliation": "Hugging Face", 39 | "name": "Suraj Patil" 40 | }, 41 | { 42 | "affiliation": "Hugging Face", 43 | "name": "Mariama Drame" 44 | }, 45 | { 46 | "affiliation": "Hugging Face", 47 | "name": "Julien Chaumond" 48 | }, 49 | { 50 | "affiliation": "Hugging Face", 51 | "name": "Julien Plu" 52 | }, 53 | { 54 | "affiliation": "Hugging Face", 55 | "name": "Joe Davison" 56 | }, 57 | { 58 | "affiliation": "Hugging Face", 59 | "name": "Simon Brandeis" 60 | }, 61 | { 62 | "affiliation": "Hugging Face", 63 | "name": "Victor Sanh" 64 | }, 65 | { 66 | "affiliation": "Hugging Face", 67 | "name": "Teven Le Scao" 68 | }, 69 | { 70 | "affiliation": "Hugging Face", 71 | "name": "Kevin Canwen Xu" 72 | }, 73 | { 74 | "affiliation": "Hugging Face", 75 | "name": "Nicolas Patry" 76 | }, 77 | { 78 | "affiliation": "Hugging Face", 79 | "name": "Steven Liu" 80 | }, 81 | { 82 | "affiliation": "Hugging Face", 83 | "name": "Angelina McMillan-Major" 84 | }, 85 | { 86 | "affiliation": "Hugging Face", 87 | "name": "Philipp Schmid" 88 | }, 89 | { 90 | "affiliation": "Hugging Face", 91 | "name": "Sylvain Gugger" 92 | }, 93 | { 94 | "affiliation": "Hugging Face", 95 | "name": "Nathan Raw" 96 | }, 97 | { 98 | "affiliation": "Hugging Face", 99 | "name": "Sylvain Lesage" 100 | }, 101 | { 102 | "affiliation": "Hugging Face", 103 | "name": "Anton Lozhkov" 104 | }, 105 | { 106 | "affiliation": "Hugging Face", 107 | "name": "Matthew Carrigan" 108 | }, 109 | { 110 | "affiliation": "Hugging Face", 111 | "name": "Th\u00e9o Matussi\u00e8re" 112 | }, 113 | { 114 | "affiliation": "Hugging Face", 115 | "name": "Leandro von Werra" 116 | }, 117 | { 118 | "affiliation": "Hugging Face", 119 | "name": "Lysandre Debut" 120 | }, 121 | { 122 | "affiliation": "Hugging Face", 123 | "name": "Stas Bekman" 124 | }, 125 | { 126 | "affiliation": "Hugging Face", 127 | "name": "Cl\u00e9ment Delangue" 128 | } 129 | ] 130 | } -------------------------------------------------------------------------------- /src/datasets/utils/deprecation_utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import inspect 3 | import warnings 4 | from functools import wraps 5 | from typing import Callable, Optional 6 | 7 | from .logging import get_logger 8 | 9 | 10 | _emitted_deprecation_warnings = set() 11 | logger = get_logger(__name__) 12 | 13 | 14 | def deprecated(help_message: Optional[str] = None): 15 | """Decorator to mark a class or a 
function as deprecated. 16 | 17 | Args: 18 | help_message (:obj:`str`, optional): An optional message to guide the user on how to 19 | switch to non-deprecated usage of the library. 20 | """ 21 | 22 | def decorator(deprecated_class_or_function: Callable): 23 | global _emitted_deprecation_warnings 24 | 25 | if inspect.isclass(deprecated_class_or_function): 26 | deprecated_function = deprecated_class_or_function.__init__ 27 | name = deprecated_class_or_function.__name__ 28 | else: 29 | deprecated_function = deprecated_class_or_function 30 | name = deprecated_function.__name__ 31 | # Support deprecating __init__ class method: class name instead 32 | name = name if name != "__init__" else deprecated_function.__qualname__.split(".")[-2] 33 | 34 | warning_msg = ( 35 | f"{name} is deprecated and will be removed in the next major version of datasets." + f" {help_message}" 36 | if help_message 37 | else "" 38 | ) 39 | 40 | @wraps(deprecated_function) 41 | def wrapper(*args, **kwargs): 42 | func_hash = hash(deprecated_function) 43 | if func_hash not in _emitted_deprecation_warnings: 44 | warnings.warn(warning_msg, category=FutureWarning, stacklevel=2) 45 | _emitted_deprecation_warnings.add(func_hash) 46 | return deprecated_function(*args, **kwargs) 47 | 48 | wrapper._decorator_name_ = "deprecated" 49 | 50 | if inspect.isclass(deprecated_class_or_function): 51 | deprecated_class_or_function.__init__ = wrapper 52 | return deprecated_class_or_function 53 | else: 54 | return wrapper 55 | 56 | return decorator 57 | 58 | 59 | class OnAccess(enum.EnumMeta): 60 | """ 61 | Enum metaclass that calls a user-specified function whenever a member is accessed. 62 | """ 63 | 64 | def __getattribute__(cls, name): 65 | obj = super().__getattribute__(name) 66 | if isinstance(obj, enum.Enum) and obj._on_access: 67 | obj._on_access() 68 | return obj 69 | 70 | def __getitem__(cls, name): 71 | member = super().__getitem__(name) 72 | if member._on_access: 73 | member._on_access() 74 | return member 75 | 76 | def __call__(cls, value, names=None, *, module=None, qualname=None, type=None, start=1): 77 | obj = super().__call__(value, names, module=module, qualname=qualname, type=type, start=start) 78 | if isinstance(obj, enum.Enum) and obj._on_access: 79 | obj._on_access() 80 | return obj 81 | 82 | 83 | class DeprecatedEnum(enum.Enum, metaclass=OnAccess): 84 | """ 85 | Enum class that calls `deprecate` method whenever a member is accessed. 86 | """ 87 | 88 | def __new__(cls, value): 89 | member = object.__new__(cls) 90 | member._value_ = value 91 | member._on_access = member.deprecate 92 | return member 93 | 94 | @property 95 | def help_message(self): 96 | return "" 97 | 98 | def deprecate(self): 99 | help_message = f" {self.help_message}" if self.help_message else "" 100 | warnings.warn( 101 | f"'{self.__objclass__.__name__}' is deprecated and will be removed in the next major version of datasets." 102 | + help_message, 103 | FutureWarning, 104 | stacklevel=3, 105 | ) 106 | -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Before you start, you'll need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.9+**. 4 | 5 | > [!TIP] 6 | > If you want to use 🤗 Datasets with TensorFlow or PyTorch, you'll need to install them separately. 
Refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2-packages-are-available) or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) for the specific install command for your framework. 7 | 8 | ## Virtual environment 9 | 10 | You should install 🤗 Datasets in a [virtual environment](https://docs.python.org/3/library/venv.html) to keep things tidy and avoid dependency conflicts. 11 | 12 | 1. Create and navigate to your project directory: 13 | 14 | ```bash 15 | mkdir ~/my-project 16 | cd ~/my-project 17 | ``` 18 | 19 | 2. Start a virtual environment inside your directory: 20 | 21 | ```bash 22 | python -m venv .env 23 | ``` 24 | 25 | 3. Activate and deactivate the virtual environment with the following commands: 26 | 27 | ```bash 28 | # Activate the virtual environment 29 | source .env/bin/activate 30 | 31 | # Deactivate the virtual environment 32 | source .env/bin/deactivate 33 | ``` 34 | 35 | Once you've created your virtual environment, you can install 🤗 Datasets in it. 36 | 37 | ## pip 38 | 39 | The most straightforward way to install 🤗 Datasets is with pip: 40 | 41 | ```bash 42 | pip install datasets 43 | ``` 44 | 45 | Run the following command to check if 🤗 Datasets has been properly installed: 46 | 47 | ```bash 48 | python -c "from datasets import load_dataset; print(load_dataset('rajpurkar/squad', split='train')[0])" 49 | ``` 50 | 51 | This command downloads version 1 of the [Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/), loads the training split, and prints the first training example. You should see: 52 | 53 | ```python 54 | {'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'id': '5733be284776f41900661182', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'title': 'University_of_Notre_Dame'} 55 | ``` 56 | 57 | ## Audio 58 | 59 | To work with audio datasets, you need to install the [`Audio`] feature as an extra dependency: 60 | 61 | ```bash 62 | pip install datasets[audio] 63 | ``` 64 | 65 | ## Vision 66 | 67 | To work with image datasets, you need to install the [`Image`] feature as an extra dependency: 68 | 69 | ```bash 70 | pip install datasets[vision] 71 | ``` 72 | 73 | ## source 74 | 75 | Building 🤗 Datasets from source lets you make changes to the code base. To install from the source, clone the repository and install with the following commands: 76 | 77 | ```bash 78 | git clone https://github.com/huggingface/datasets.git 79 | cd datasets 80 | pip install -e . 
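# the -e ("editable") flag installs the package from the cloned sources,
# so local changes to the code base take effect without reinstalling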
81 | ``` 82 | 83 | Again, you can check if 🤗 Datasets was properly installed with the following command: 84 | 85 | ```bash 86 | python -c "from datasets import load_dataset; print(load_dataset('rajpurkar/squad', split='train')[0])" 87 | ``` 88 | 89 | ## conda 90 | 91 | 🤗 Datasets can also be installed from conda, a package management system: 92 | 93 | ```bash 94 | conda install -c huggingface -c conda-forge datasets 95 | ``` 96 | -------------------------------------------------------------------------------- /docs/source/use_with_pyarrow.mdx: -------------------------------------------------------------------------------- 1 | # Use with PyArrow 2 | 3 | This document is a quick introduction to using `datasets` with PyArrow, with a particular focus on how to process 4 | datasets using Arrow compute functions, and how to convert a dataset to PyArrow or from PyArrow. 5 | 6 | This is particularly useful as it allows fast zero-copy operations, since `datasets` uses PyArrow under the hood. 7 | 8 | ## Dataset format 9 | 10 | By default, datasets return regular Python objects: integers, floats, strings, lists, etc. 11 | 12 | To get PyArrow Tables or Arrays instead, you can set the format of the dataset to `pyarrow` using [`Dataset.with_format`]: 13 | 14 | ```py 15 | >>> from datasets import Dataset 16 | >>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]} 17 | >>> ds = Dataset.from_dict(data) 18 | >>> ds = ds.with_format("arrow") 19 | >>> ds[0] # pa.Table 20 | pyarrow.Table 21 | col_0: string 22 | col_1: double 23 | ---- 24 | col_0: [["a"]] 25 | col_1: [[0]] 26 | >>> ds[:2] # pa.Table 27 | pyarrow.Table 28 | col_0: string 29 | col_1: double 30 | ---- 31 | col_0: [["a","b"]] 32 | col_1: [[0,0]] 33 | >>> ds["data"] # pa.array 34 | 35 | [ 36 | [ 37 | "a", 38 | "b", 39 | "c", 40 | "d" 41 | ] 42 | ] 43 | ``` 44 | 45 | This also works for `IterableDataset` objects obtained e.g. using `load_dataset(..., streaming=True)`: 46 | 47 | ```py 48 | >>> ds = ds.with_format("arrow") 49 | >>> for table in ds.iter(batch_size=2): 50 | ... print(table) 51 | ... break 52 | pyarrow.Table 53 | col_0: string 54 | col_1: double 55 | ---- 56 | col_0: [["a","b"]] 57 | col_1: [[0,0]] 58 | ``` 59 | 60 | ## Process data 61 | 62 | PyArrow functions are generally faster than regular hand-written python functions, and therefore they are a good option to optimize data processing. You can use Arrow compute functions to process a dataset in [`Dataset.map`] or [`Dataset.filter`]: 63 | 64 | ```python 65 | >>> import pyarrow.compute as pc 66 | >>> from datasets import Dataset 67 | >>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]} 68 | >>> ds = Dataset.from_dict(data) 69 | >>> ds = ds.with_format("arrow") 70 | >>> ds = ds.map(lambda t: t.append_column("col_2", pc.add(t["col_1"], 1)), batched=True) 71 | >>> ds[:2] 72 | pyarrow.Table 73 | col_0: string 74 | col_1: double 75 | col_2: double 76 | ---- 77 | col_0: [["a","b"]] 78 | col_1: [[0,0]] 79 | col_2: [[1,1]] 80 | >>> ds = ds.filter(lambda t: pc.equal(t["col_0"], "b"), batched=True) 81 | >>> ds[0] 82 | pyarrow.Table 83 | col_0: string 84 | col_1: double 85 | col_2: double 86 | ---- 87 | col_0: [["b"]] 88 | col_1: [[0]] 89 | col_2: [[1]] 90 | ``` 91 | 92 | We use `batched=True` because it is faster to process batches of data in PyArrow rather than row by row. It's also possible to use `batch_size=` in `map()` to set the size of each `table`. 93 | 94 | This also works for [`IterableDataset.map`] and [`IterableDataset.filter`]. 
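For instance, here is a minimal sketch that reuses the same toy data as above for the streaming case:

```py
>>> import pyarrow.compute as pc
>>> from datasets import Dataset
>>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]}
>>> ids = Dataset.from_dict(data).to_iterable_dataset()
>>> ids = ids.with_format("arrow")
>>> ids = ids.map(lambda t: t.append_column("col_2", pc.add(t["col_1"], 1)), batched=True)
>>> ids = ids.filter(lambda t: pc.equal(t["col_0"], "b"), batched=True)
>>> for table in ids.iter(batch_size=2):
...     print(table)  # a pyarrow.Table containing only the rows where col_0 == "b"
...     break
```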
95 | 96 | ## Import or Export from PyArrow 97 | 98 | A [`Dataset`] is a wrapper of a PyArrow Table, you can instantiate a Dataset directly from the Table: 99 | 100 | ```python 101 | ds = Dataset(table) 102 | ``` 103 | 104 | You can access the PyArrow Table of a dataset using [`Dataset.data`], which returns a [`MemoryMappedTable`] or a [`InMemoryTable`] or a [`ConcatenationTable`], depending on the origin of the Arrow data and the operations that were applied. 105 | 106 | Those objects wrap the underlying PyArrow table accessible at `Dataset.data.table`. This table contains all the data of the dataset, but there might also be an indices mapping at `Dataset._indices` which maps the dataset rows indices to the PyArrow Table rows indices. This can happen if the dataset has been shuffled with [`Dataset.shuffle`] or if only a subset of the rows are used (e.g. after a [`Dataset.select`]). 107 | 108 | In the general case, you can export a dataset to a PyArrow Table using `table = ds.with_format("arrow")[:]`. 109 | -------------------------------------------------------------------------------- /src/datasets/download/download_config.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from dataclasses import dataclass, field 3 | from pathlib import Path 4 | from typing import Any, Optional, Union 5 | 6 | from .. import config 7 | 8 | 9 | @dataclass 10 | class DownloadConfig: 11 | """Configuration for our cached path manager. 12 | 13 | Attributes: 14 | cache_dir (`str` or `Path`, *optional*): 15 | Specify a cache directory to save the file to (overwrite the 16 | default cache dir). 17 | force_download (`bool`, defaults to `False`): 18 | If `True`, re-download the file even if it's already cached in 19 | the cache dir. 20 | resume_download (`bool`, defaults to `False`): 21 | If `True`, resume the download if an incompletely received file is 22 | found. 23 | proxies (`dict`, *optional*): 24 | user_agent (`str`, *optional*): 25 | Optional string or dict that will be appended to the user-agent on remote 26 | requests. 27 | extract_compressed_file (`bool`, defaults to `False`): 28 | If `True` and the path point to a zip or tar file, 29 | extract the compressed file in a folder along the archive. 30 | force_extract (`bool`, defaults to `False`): 31 | If `True` when `extract_compressed_file` is `True` and the archive 32 | was already extracted, re-extract the archive and override the folder where it was extracted. 33 | delete_extracted (`bool`, defaults to `False`): 34 | Whether to delete (or keep) the extracted files. 35 | extract_on_the_fly (`bool`, defaults to `False`): 36 | If `True`, extract compressed files while they are being read. 37 | use_etag (`bool`, defaults to `True`): 38 | Whether to use the ETag HTTP response header to validate the cached files. 39 | num_proc (`int`, *optional*): 40 | The number of processes to launch to download the files in parallel. 41 | max_retries (`int`, default to `1`): 42 | The number of times to retry an HTTP request if it fails. 43 | token (`str` or `bool`, *optional*): 44 | Optional string or boolean to use as Bearer token 45 | for remote files on the Datasets Hub. If `True`, or not specified, will get token from `~/.huggingface`. 46 | storage_options (`dict`, *optional*): 47 | Key/value pairs to be passed on to the dataset file-system backend, if any. 48 | download_desc (`str`, *optional*): 49 | A description to be displayed alongside with the progress bar while downloading the files. 
50 | disable_tqdm (`bool`, defaults to `False`): 51 | Whether to disable the individual files download progress bar. 52 | """ 53 | 54 | cache_dir: Optional[Union[str, Path]] = None 55 | force_download: bool = False 56 | resume_download: bool = False 57 | local_files_only: bool = False 58 | proxies: Optional[dict] = None 59 | user_agent: Optional[str] = None 60 | extract_compressed_file: bool = False 61 | force_extract: bool = False 62 | delete_extracted: bool = False 63 | extract_on_the_fly: bool = False 64 | use_etag: bool = True 65 | num_proc: Optional[int] = None 66 | max_retries: int = 1 67 | token: Optional[Union[str, bool]] = None 68 | storage_options: dict[str, Any] = field(default_factory=dict) 69 | download_desc: Optional[str] = None 70 | disable_tqdm: bool = False 71 | 72 | def copy(self) -> "DownloadConfig": 73 | return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()}) 74 | 75 | def __setattr__(self, name, value): 76 | if name == "token" and getattr(self, "storage_options", None) is not None: 77 | if "hf" not in self.storage_options: 78 | self.storage_options["hf"] = {"endpoint": config.HF_ENDPOINT, "token": value} 79 | elif self.storage_options["hf"].get("token") is None: # dict lookup, so an already-set token is preserved 80 | self.storage_options["hf"]["token"] = value 81 | super().__setattr__(name, value) 82 | --------------------------------------------------------------------------------
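As a usage sketch (hypothetical, not part of the file above), a `DownloadConfig` is typically built once and handed to the loading functions; setting `token` after construction triggers the `__setattr__` hook that mirrors it into `storage_options["hf"]`:

```py
>>> from datasets import DownloadConfig, load_dataset

>>> dl_config = DownloadConfig(max_retries=3, num_proc=4)
>>> dl_config.token = True  # mirrored into storage_options["hf"] by __setattr__
>>> dl_config.storage_options["hf"]["token"]
True
>>> ds = load_dataset("rajpurkar/squad", split="train", download_config=dl_config)
```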