├── .dvc ├── config ├── .gitignore └── plots │ ├── scatter.json │ ├── default.json │ ├── confusion.json │ └── smooth.json ├── tests ├── __init__.py ├── io │ ├── __init__.py │ └── data │ │ ├── test_file.json.gz │ │ ├── test_file.json.xz │ │ ├── test_file.json.bz2 │ │ └── test_image_rgb.jpg ├── commands │ ├── __init__.py │ ├── conftest.py │ └── test_test.py ├── features │ ├── __init__.py │ ├── data │ │ ├── test_pdf.pdf │ │ ├── test_nifti.nii │ │ ├── test_image_rgb.jpg │ │ ├── test_nifti.nii.gz │ │ ├── test_audio_16000.mp3 │ │ ├── test_audio_16000.pcm │ │ ├── test_audio_44100.mp3 │ │ ├── test_audio_44100.wav │ │ ├── test_audio_48000.opus │ │ ├── test_image_rgba.png │ │ └── test_video_66x50.mov │ └── test_pdf.py ├── fixtures │ └── __init__.py ├── packaged_modules │ ├── __init__.py │ ├── test_sql.py │ ├── test_pandas.py │ ├── test_parquet.py │ └── test_arrow.py ├── test_filelock.py ├── _test_patching.py ├── test_experimental.py ├── test_version.py ├── test_info_utils.py ├── test_exceptions.py ├── distributed_scripts │ └── run_torch_distributed.py ├── test_splits.py ├── test_parallel.py ├── test_dataset_list.py ├── test_sharding_utils.py ├── test_offline_util.py ├── conftest.py ├── test_filesystem.py └── test_hub.py ├── benchmarks ├── results │ ├── .gitkeep │ ├── benchmark_indices_mapping.json │ ├── benchmark_getitem_100B.json │ ├── benchmark_map_filter.json │ ├── benchmark_iterating.json │ └── benchmark_array_xd.json ├── format.py ├── benchmark_indices_mapping.py ├── utils.py ├── benchmark_getitem_100B.py └── benchmark_map_filter.py ├── src └── datasets │ ├── io │ ├── __init__.py │ ├── abc.py │ ├── spark.py │ ├── text.py │ └── generator.py │ ├── utils │ ├── resources │ │ ├── __init__.py │ │ ├── multilingualities.json │ │ ├── size_categories.json │ │ └── creators.json │ ├── hub.py │ ├── typing.py │ ├── filelock.py │ ├── doc_utils.py │ ├── __init__.py │ ├── experimental.py │ ├── track.py │ ├── _filelock.py │ ├── version.py │ └── deprecation_utils.py │ ├── packaged_modules │ ├── arrow │ │ ├── __init__.py │ │ └── arrow.py │ ├── cache │ │ └── __init__.py │ ├── csv │ │ └── __init__.py │ ├── eval │ │ ├── __init__.py │ │ └── eval.py │ ├── hdf5 │ │ └── __init__.py │ ├── json │ │ └── __init__.py │ ├── pandas │ │ ├── __init__.py │ │ └── pandas.py │ ├── spark │ │ └── __init__.py │ ├── sql │ │ └── __init__.py │ ├── text │ │ └── __init__.py │ ├── xml │ │ ├── __init__.py │ │ └── xml.py │ ├── audiofolder │ │ ├── __init__.py │ │ └── audiofolder.py │ ├── generator │ │ ├── __init__.py │ │ └── generator.py │ ├── imagefolder │ │ ├── __init__.py │ │ └── imagefolder.py │ ├── niftifolder │ │ ├── __init__.py │ │ └── niftifolder.py │ ├── parquet │ │ └── __init__.py │ ├── pdffolder │ │ ├── __init__.py │ │ └── pdffolder.py │ ├── videofolder │ │ ├── __init__.py │ │ └── videofolder.py │ ├── webdataset │ │ └── __init__.py │ └── folder_based_builder │ │ └── __init__.py │ ├── parallel │ └── __init__.py │ ├── download │ ├── __init__.py │ └── download_config.py │ ├── commands │ ├── __init__.py │ ├── datasets_cli.py │ ├── env.py │ └── delete_from_hub.py │ ├── features │ ├── _torchcodec.py │ └── __init__.py │ ├── distributed.py │ ├── filesystems │ └── __init__.py │ ├── __init__.py │ └── naming.py ├── .github ├── conda │ ├── build.sh │ └── meta.yaml ├── workflows │ ├── trufflehog.yml │ ├── upload_pr_documentation.yml │ ├── build_pr_documentation.yml │ ├── build_documentation.yml │ ├── self-assign.yaml │ └── release-conda.yml └── ISSUE_TEMPLATE │ ├── config.yml │ ├── feature-request.yml │ └── bug-report.yml ├── docs └── source │ ├── 
imgs │ ├── course_banner.png │ └── datasets_logo_name.jpg │ ├── _redirects.yml │ ├── _config.py │ ├── package_reference │ ├── builder_classes.mdx │ ├── utilities.mdx │ ├── table_classes.mdx │ └── loading_methods.mdx │ ├── tutorial.md │ ├── cli.mdx │ ├── how_to.md │ ├── nlp_load.mdx │ ├── about_arrow.md │ ├── dataset_card.mdx │ ├── use_with_pandas.mdx │ ├── index.mdx │ ├── use_with_spark.mdx │ ├── filesystems.mdx │ ├── nlp_process.mdx │ ├── image_classification.mdx │ ├── about_map_batch.mdx │ ├── about_cache.mdx │ ├── image_process.mdx │ ├── audio_process.mdx │ ├── installation.md │ └── use_with_pyarrow.mdx ├── .dvcignore ├── .pre-commit-config.yaml ├── AUTHORS ├── ADD_NEW_DATASET.md ├── Makefile ├── pyproject.toml ├── .gitignore ├── SECURITY.md ├── notebooks └── README.md ├── templates └── README.md └── .zenodo.json /.dvc/config: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/results/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/fixtures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/packaged_modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/utils/resources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/arrow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/cache/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/csv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/eval/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/datasets/packaged_modules/hdf5/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/json/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/spark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/sql/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/xml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/audiofolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/generator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/imagefolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/niftifolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/parquet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/pdffolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/videofolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/webdataset/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/datasets/packaged_modules/folder_based_builder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/conda/build.sh: -------------------------------------------------------------------------------- 1 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt 2 | -------------------------------------------------------------------------------- /src/datasets/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .parallel import ParallelBackendConfig, parallel_backend, parallel_map 2 | -------------------------------------------------------------------------------- /tests/io/data/test_file.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/io/data/test_file.json.gz -------------------------------------------------------------------------------- /tests/io/data/test_file.json.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/io/data/test_file.json.xz -------------------------------------------------------------------------------- /tests/features/data/test_pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_pdf.pdf -------------------------------------------------------------------------------- /tests/io/data/test_file.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/io/data/test_file.json.bz2 -------------------------------------------------------------------------------- /tests/io/data/test_image_rgb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/io/data/test_image_rgb.jpg -------------------------------------------------------------------------------- /docs/source/imgs/course_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/docs/source/imgs/course_banner.png -------------------------------------------------------------------------------- /tests/features/data/test_nifti.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_nifti.nii -------------------------------------------------------------------------------- /tests/features/data/test_image_rgb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_image_rgb.jpg -------------------------------------------------------------------------------- /tests/features/data/test_nifti.nii.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_nifti.nii.gz -------------------------------------------------------------------------------- /docs/source/imgs/datasets_logo_name.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/docs/source/imgs/datasets_logo_name.jpg -------------------------------------------------------------------------------- /tests/features/data/test_audio_16000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_16000.mp3 -------------------------------------------------------------------------------- /tests/features/data/test_audio_16000.pcm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_16000.pcm -------------------------------------------------------------------------------- /tests/features/data/test_audio_44100.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_44100.mp3 -------------------------------------------------------------------------------- /tests/features/data/test_audio_44100.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_44100.wav -------------------------------------------------------------------------------- /tests/features/data/test_audio_48000.opus: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_audio_48000.opus -------------------------------------------------------------------------------- /tests/features/data/test_image_rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_image_rgba.png -------------------------------------------------------------------------------- /tests/features/data/test_video_66x50.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets/HEAD/tests/features/data/test_video_66x50.mov -------------------------------------------------------------------------------- /.dvcignore: -------------------------------------------------------------------------------- 1 | # Add patterns of files dvc should ignore, which could improve 2 | # the performance. 
Learn more at 3 | # https://dvc.org/doc/user-guide/dvcignore 4 | -------------------------------------------------------------------------------- /src/datasets/utils/hub.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from huggingface_hub import hf_hub_url 4 | 5 | 6 | hf_dataset_url = partial(hf_hub_url, repo_type="dataset") 7 | -------------------------------------------------------------------------------- /benchmarks/results/benchmark_indices_mapping.json: -------------------------------------------------------------------------------- 1 | {"num examples": 500000, "select": 0.03741131999413483, "sort": 0.7371353159978753, "shuffle": 0.17655655200360343, "train_test_split": 0.29633847798686475, "shard": 0.01452581599005498} -------------------------------------------------------------------------------- /benchmarks/results/benchmark_getitem_100B.json: -------------------------------------------------------------------------------- 1 | {"num examples": 100000000000, "get_first_row": 0.00019991099999927542, "get_last_row": 5.4411000000698095e-05, "get_batch_of_1024_rows": 0.0004897069999998394, "get_batch_of_1024_random_rows": 0.01800621099999944} -------------------------------------------------------------------------------- /src/datasets/utils/resources/multilingualities.json: -------------------------------------------------------------------------------- 1 | { 2 | "monolingual": "contains a single language", 3 | "multilingual": "contains multiple languages", 4 | "translation": "contains translated or aligned text", 5 | "other": "other type of language distribution" 6 | } 7 | -------------------------------------------------------------------------------- /src/datasets/utils/typing.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import TypeVar, Union 3 | 4 | 5 | T = TypeVar("T") 6 | 7 | ListLike = Union[list[T], tuple[T, ...]] 8 | NestedDataStructureLike = Union[T, list[T], dict[str, T]] 9 | PathLike = Union[str, bytes, os.PathLike] 10 | -------------------------------------------------------------------------------- /src/datasets/utils/resources/size_categories.json: -------------------------------------------------------------------------------- 1 | [ 2 | "unknown", 3 | "n<1K", 4 | "1K<n<10K", 5 | "10K<n<100K", 6 | "100K<n<1M", 7 | "1M<n<10M", 8 | "10M<n<100M", 9 | "100M<n<1B", 10 | "1B<n<10B", 11 | "10B<n<100B", 12 | "100B<n<1T", 13 | "n>1T" 14 | ] 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/charliermarsh/ruff-pre-commit # https://github.com/charliermarsh/ruff#usage 3 | rev: 'v0.11.8' 4 | hooks: 5 | # Run the linter. 6 | - id: ruff 7 | args: [ --fix ] 8 | # Run the formatter.
9 | - id: ruff-format 10 | -------------------------------------------------------------------------------- /tests/commands/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from huggingface_hub import snapshot_download 3 | 4 | 5 | @pytest.fixture 6 | def dataset_dir(tmp_path): 7 | dataset_dir = tmp_path / "test_command_dataset_dir" 8 | snapshot_download("hf-internal-testing/ner-jsonl", repo_type="dataset", local_dir=dataset_dir) 9 | return str(dataset_dir) 10 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the list of HuggingFace Datasets authors for copyright purposes. 2 | # 3 | # This does not necessarily list everyone who has contributed code, since in 4 | # some cases, their employer may be the copyright holder. To see the full list 5 | # of contributors, see the revision history in source control. 6 | 7 | Google Inc. 8 | HuggingFace Inc. 9 | -------------------------------------------------------------------------------- /src/datasets/download/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "DownloadConfig", 3 | "DownloadManager", 4 | "DownloadMode", 5 | "StreamingDownloadManager", 6 | ] 7 | 8 | from .download_config import DownloadConfig 9 | from .download_manager import DownloadManager, DownloadMode 10 | from .streaming_download_manager import StreamingDownloadManager 11 | -------------------------------------------------------------------------------- /src/datasets/utils/resources/creators.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": [ 3 | "found", 4 | "crowdsourced", 5 | "expert-generated", 6 | "machine-generated", 7 | "other" 8 | ], 9 | "annotations": [ 10 | "found", 11 | "crowdsourced", 12 | "expert-generated", 13 | "machine-generated", 14 | "no-annotation", 15 | "other" 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /tests/test_filelock.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets.utils._filelock import FileLock 4 | 5 | 6 | def test_long_path(tmpdir): 7 | filename = "a" * 1000 + ".lock" 8 | lock1 = FileLock(str(tmpdir / filename)) 9 | assert lock1.lock_file.endswith(".lock") 10 | assert not lock1.lock_file.endswith(filename) 11 | assert len(os.path.basename(lock1.lock_file)) <= 255 12 | -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | 4 | name: Secret Leaks 5 | 6 | permissions: 7 | contents: read 8 | 9 | jobs: 10 | trufflehog: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | - name: Secret Scanning 18 | uses: trufflesecurity/trufflehog@main 19 | -------------------------------------------------------------------------------- /src/datasets/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseDatasetsCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise 
NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /tests/_test_patching.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 2 | # This is the module that test_patching.py uses to test patch_submodule() 3 | import os 4 | import os as renamed_os 5 | from os import path 6 | from os import path as renamed_path 7 | from os.path import join 8 | from os.path import join as renamed_join 9 | 10 | 11 | open = open # we just need to have a builtin inside this module to test it properly 12 | -------------------------------------------------------------------------------- /ADD_NEW_DATASET.md: -------------------------------------------------------------------------------- 1 | # How to add a new dataset 2 | 3 | Add datasets directly to the 🤗 Hugging Face Hub! 4 | 5 | You can share your dataset on https://huggingface.co/datasets directly using your account; see the documentation: 6 | 7 | * [Create a dataset and upload files on the website](https://huggingface.co/docs/datasets/upload_dataset) 8 | * [Advanced guide using the CLI](https://huggingface.co/docs/datasets/share) 9 | -------------------------------------------------------------------------------- /src/datasets/utils/filelock.py: -------------------------------------------------------------------------------- 1 | # deprecated, please use the `filelock` package instead 2 | 3 | from filelock import ( # noqa: F401 # imported for backward compatibility TODO: remove in 3.0.0 4 | BaseFileLock, 5 | SoftFileLock, 6 | Timeout, 7 | UnixFileLock, 8 | WindowsFileLock, 9 | ) 10 | 11 | from ._filelock import FileLock # noqa: F401 # imported for backward compatibility.
TODO: remove in 3.0.0 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: Datasets on the Hugging Face Hub 3 | url: https://huggingface.co/datasets 4 | about: Please use the "Community" tab of the dataset on the Hugging Face Hub to open a discussion or a pull request 5 | - name: Forum 6 | url: https://discuss.huggingface.co/c/datasets/10 7 | about: Please ask and answer questions here, and engage with other community members 8 | -------------------------------------------------------------------------------- /benchmarks/results/benchmark_map_filter.json: -------------------------------------------------------------------------------- 1 | {"num examples": 500000, "map identity": 10.19139202599763, "map identity batched": 0.6804238399927272, "map no-op batched": 0.5342009569867514, "map no-op batched numpy": 0.5792830920108827, "map no-op batched pandas": 0.4343639040016569, "map no-op batched pytorch": 0.5403374370071106, "map no-op batched tensorflow": 1.3869360350072384, "map fast-tokenizer batched": 8.074308118986664, "filter": 1.841787679004483} -------------------------------------------------------------------------------- /docs/source/_redirects.yml: -------------------------------------------------------------------------------- 1 | # This first_section was backported from nginx 2 | loading_datasets: loading 3 | share_dataset: share 4 | quicktour: quickstart 5 | dataset_streaming: stream 6 | torch_tensorflow: use_dataset 7 | splits: loading#slice-splits 8 | processing: process 9 | faiss_and_ea: faiss_es 10 | features: about_dataset_features 11 | exploring: access 12 | package_reference/logging_methods: package_reference/utilities 13 | # end of first_section 14 | -------------------------------------------------------------------------------- /docs/source/_config.py: -------------------------------------------------------------------------------- 1 | # docstyle-ignore 2 | INSTALL_CONTENT = """ 3 | # Datasets installation 4 | ! pip install datasets transformers 5 | # To install from source instead of the last release, comment the command above and uncomment the following one. 6 | # ! pip install git+https://github.com/huggingface/datasets.git 7 | """ 8 | 9 | notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] 10 | default_branch_name = "main" 11 | version_prefix = "" 12 | -------------------------------------------------------------------------------- /.github/workflows/upload_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Upload PR Documentation 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Build PR Documentation"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | build: 11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main 12 | with: 13 | package_name: datasets 14 | secrets: 15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 16 | comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} -------------------------------------------------------------------------------- /src/datasets/utils/doc_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | 4 | def is_documented_by(function_with_docstring: Callable): 5 | """Decorator to share docstrings across common functions. 
6 | 7 | Args: 8 | function_with_docstring (`Callable`): Name of the function with the docstring. 9 | """ 10 | 11 | def wrapper(target_function): 12 | target_function.__doc__ = function_with_docstring.__doc__ 13 | return target_function 14 | 15 | return wrapper 16 | -------------------------------------------------------------------------------- /.github/workflows/build_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.event.pull_request.head.sha }} 15 | pr_number: ${{ github.event.number }} 16 | package: datasets 17 | -------------------------------------------------------------------------------- /tests/test_experimental.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import warnings 3 | 4 | from datasets.utils import experimental 5 | 6 | 7 | @experimental 8 | def dummy_function(): 9 | return "success" 10 | 11 | 12 | class TestExperimentalFlag(unittest.TestCase): 13 | def test_experimental_warning(self): 14 | with warnings.catch_warnings(record=True) as w: 15 | warnings.simplefilter("always") 16 | self.assertEqual(dummy_function(), "success") 17 | self.assertEqual(len(w), 1) 18 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style test 2 | 3 | check_dirs := tests src benchmarks utils 4 | 5 | # Check that source code meets quality standards 6 | 7 | quality: 8 | ruff check $(check_dirs) setup.py # linter 9 | ruff format --check $(check_dirs) setup.py # formatter 10 | 11 | # Format source code automatically 12 | 13 | style: 14 | ruff check --fix $(check_dirs) setup.py # linter 15 | ruff format $(check_dirs) setup.py # formatter 16 | 17 | # Run tests for the library 18 | 19 | test: 20 | python -m pytest -n auto --dist=loadfile -s -v ./tests/ 21 | -------------------------------------------------------------------------------- /.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - doc-builder* 8 | - v*-release 9 | - v*-patch 10 | 11 | jobs: 12 | build: 13 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main 14 | with: 15 | commit_sha: ${{ github.sha }} 16 | package: datasets 17 | notebook_folder: datasets_doc 18 | secrets: 19 | token: ${{ secrets.HUGGINGFACE_PUSH }} 20 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 21 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/pdffolder/pdffolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class PdfFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """BuilderConfig for ImageFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | 
super().__post_init__() 17 | 18 | 19 | class PdfFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Pdf 21 | BASE_COLUMN_NAME = "pdf" 22 | BUILDER_CONFIG_CLASS = PdfFolderConfig 23 | EXTENSIONS: list[str] = [".pdf"] 24 | -------------------------------------------------------------------------------- /src/datasets/features/_torchcodec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torchcodec.decoders import AudioDecoder as _AudioDecoder 3 | 4 | 5 | class AudioDecoder(_AudioDecoder): 6 | def __getitem__(self, key: str): 7 | if key == "array": 8 | y = self.get_all_samples().data.cpu().numpy() 9 | return np.mean(y, axis=tuple(range(y.ndim - 1))) if y.ndim > 1 else y 10 | elif key == "sampling_rate": 11 | return self.get_samples_played_in_range(0, 0).sample_rate 12 | elif hasattr(super(), "__getitem__"): 13 | return super().__getitem__(key) 14 | else: 15 | raise TypeError("'torchcodec.decoders.AudioDecoder' object is not subscriptable") 16 | -------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.utils.version import Version 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "other, expected_equality", 8 | [ 9 | (Version("1.0.0"), True), 10 | ("1.0.0", True), 11 | (Version("2.0.0"), False), 12 | ("2.0.0", False), 13 | ("1", False), 14 | ("a", False), 15 | (1, False), 16 | (None, False), 17 | ], 18 | ) 19 | def test_version_equality_and_hash(other, expected_equality): 20 | version = Version("1.0.0") 21 | assert (version == other) is expected_equality 22 | assert (version != other) is not expected_equality 23 | assert (hash(version) == hash(other)) is expected_equality 24 | -------------------------------------------------------------------------------- /src/datasets/features/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "Audio", 3 | "Array2D", 4 | "Array3D", 5 | "Array4D", 6 | "Array5D", 7 | "ClassLabel", 8 | "Features", 9 | "LargeList", 10 | "List", 11 | "Sequence", 12 | "Value", 13 | "Image", 14 | "Translation", 15 | "TranslationVariableLanguages", 16 | "Video", 17 | "Pdf", 18 | "Nifti", 19 | ] 20 | from .audio import Audio 21 | from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value 22 | from .image import Image 23 | from .nifti import Nifti 24 | from .pdf import Pdf 25 | from .translation import Translation, TranslationVariableLanguages 26 | from .video import Video 27 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/niftifolder/niftifolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class NiftiFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """BuilderConfig for NiftiFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | super().__post_init__() 17 | 18 | 19 | class NiftiFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Nifti 21 | BASE_COLUMN_NAME = "nifti" 22 | BUILDER_CONFIG_CLASS = NiftiFolderConfig 23 | EXTENSIONS: list[str] = [".nii", 
".nii.gz"] 24 | -------------------------------------------------------------------------------- /tests/packaged_modules/test_sql.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.builder import InvalidConfigName 4 | from datasets.data_files import DataFilesList 5 | from datasets.packaged_modules.sql.sql import SqlConfig 6 | 7 | 8 | def test_config_raises_when_invalid_name() -> None: 9 | with pytest.raises(InvalidConfigName, match="Bad characters"): 10 | _ = SqlConfig(name="name-with-*-invalid-character") 11 | 12 | 13 | @pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])]) 14 | def test_config_raises_when_invalid_data_files(data_files) -> None: 15 | with pytest.raises(ValueError, match="Expected a DataFilesDict"): 16 | _ = SqlConfig(name="name", data_files=data_files) 17 | -------------------------------------------------------------------------------- /tests/packaged_modules/test_pandas.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.builder import InvalidConfigName 4 | from datasets.data_files import DataFilesList 5 | from datasets.packaged_modules.pandas.pandas import PandasConfig 6 | 7 | 8 | def test_config_raises_when_invalid_name() -> None: 9 | with pytest.raises(InvalidConfigName, match="Bad characters"): 10 | _ = PandasConfig(name="name-with-*-invalid-character") 11 | 12 | 13 | @pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])]) 14 | def test_config_raises_when_invalid_data_files(data_files) -> None: 15 | with pytest.raises(ValueError, match="Expected a DataFilesDict"): 16 | _ = PandasConfig(name="name", data_files=data_files) 17 | -------------------------------------------------------------------------------- /tests/packaged_modules/test_parquet.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.builder import InvalidConfigName 4 | from datasets.data_files import DataFilesList 5 | from datasets.packaged_modules.parquet.parquet import ParquetConfig 6 | 7 | 8 | def test_config_raises_when_invalid_name() -> None: 9 | with pytest.raises(InvalidConfigName, match="Bad characters"): 10 | _ = ParquetConfig(name="name-with-*-invalid-character") 11 | 12 | 13 | @pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])]) 14 | def test_config_raises_when_invalid_data_files(data_files) -> None: 15 | with pytest.raises(ValueError, match="Expected a DataFilesDict"): 16 | _ = ParquetConfig(name="name", data_files=data_files) 17 | -------------------------------------------------------------------------------- /.dvc/plots/scatter.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": "point", 8 | "encoding": { 9 | "x": { 10 | "field": "", 11 | "type": "quantitative", 12 | "title": "" 13 | }, 14 | "y": { 15 | "field": "", 16 | "type": "quantitative", 17 | "title": "", 18 | "scale": { 19 | "zero": false 20 | } 21 | }, 22 | "color": { 23 | "field": "rev", 24 | "type": "nominal" 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 119 3 | 4 | [tool.ruff.lint] 5 | # Ignored rules: 6 | # "E501" -> line length violation 7 | # "F821" -> undefined named in type annotation (e.g. Literal["something"]) 8 | # "C901" -> `function_name` is too complex 9 | ignore = ["E501", "F821", "C901"] 10 | select = ["C", "E", "F", "I", "W"] 11 | 12 | [tool.ruff.lint.isort] 13 | lines-after-imports = 2 14 | known-first-party = ["datasets"] 15 | 16 | [tool.ruff.lint.per-file-ignores] 17 | "__init__.py" = ["F401", "F403", "F405"] 18 | 19 | [tool.pytest.ini_options] 20 | # Test fails if a FutureWarning is thrown by `huggingface_hub` 21 | filterwarnings = [ 22 | "error::FutureWarning:huggingface_hub*", 23 | ] 24 | markers = [ 25 | "unit: unit test", 26 | "integration: integration test", 27 | ] 28 | -------------------------------------------------------------------------------- /.dvc/plots/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | "field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /.dvc/plots/confusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": "rect", 8 | "encoding": { 9 | "x": { 10 | "field": "", 11 | "type": "nominal", 12 | "sort": "ascending", 13 | "title": "" 14 | }, 15 | "y": { 16 | "field": "", 17 | "type": "nominal", 18 | "sort": "ascending", 19 | "title": "" 20 | }, 21 | "color": { 22 | "aggregate": "count", 23 | "type": "quantitative" 24 | }, 25 | "facet": { 26 | "field": "rev", 27 | "type": "nominal" 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /.github/workflows/self-assign.yaml: -------------------------------------------------------------------------------- 1 | name: Self-assign 2 | on: 3 | issue_comment: 4 | types: created 5 | jobs: 6 | one: 7 | runs-on: ubuntu-latest 8 | if: >- 9 | (github.event.comment.body == '#take' || 10 | github.event.comment.body == '#self-assign') 11 | && !github.event.issue.assignee 12 | steps: 13 | - run: | 14 | echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" 15 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees 16 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X "DELETE" https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels/help%20wanted 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Locked files 2 | *.lock 3 | !dvc.lock 4 | 5 | # Extracted dummy data 6 | 
datasets/**/dummy_data-zip-extracted/ 7 | 8 | # Compiled python modules. 9 | *.pyc 10 | 11 | # Byte-compiled 12 | _pycache__/ 13 | .cache/ 14 | 15 | # Python egg metadata, regenerated from source files by setuptools. 16 | *.egg-info 17 | .eggs/ 18 | 19 | # PyPI distribution artifacts. 20 | build/ 21 | dist/ 22 | 23 | # Environments 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | ENV/ 29 | env.bak/ 30 | venv.bak/ 31 | 32 | # pyenv 33 | .python-version 34 | 35 | # Tests 36 | .pytest_cache/ 37 | 38 | # Other 39 | *.DS_Store 40 | 41 | # PyCharm/vscode 42 | .idea 43 | .vscode 44 | 45 | # Vim 46 | .*.swp 47 | 48 | # playground 49 | /playground 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | docs/source/_build/ 54 | 55 | # Benchmark results 56 | report.json 57 | report.md 58 | 59 | # Ruff 60 | .ruff_cache 61 | -------------------------------------------------------------------------------- /benchmarks/results/benchmark_iterating.json: -------------------------------------------------------------------------------- 1 | {"num examples": 50000, "read 5000": 0.2152090710005723, "read 50000": 2.077654693988734, "read_batch 50000 10": 1.5041199039987987, "read_batch 50000 100": 1.5411947140091797, "read_batch 50000 1000": 1.4684901159926085, "read_formatted numpy 5000": 4.584776938994764, "read_formatted pandas 5000": 3.7457121399929747, "read_formatted torch 5000": 4.565676491998602, "read_formatted tensorflow 5000": 5.269861594992108, "read_formatted_batch numpy 5000 10": 0.4242750950070331, "read_formatted_batch numpy 5000 1000": 0.007607111998368055, "shuffled read 5000": 0.22604441999283154, "shuffled read 50000": 2.268928524994408, "shuffled read_batch 50000 10": 55.44462437101174, "shuffled read_batch 50000 100": 6.876476717996411, "shuffled read_batch 50000 1000": 2.1420724369963864, "shuffled read_formatted numpy 5000": 4.8052272600034485, "shuffled read_formatted_batch numpy 5000 10": 6.500664097999106, "shuffled read_formatted_batch numpy 5000 1000": 0.0754691059992183} -------------------------------------------------------------------------------- /docs/source/package_reference/builder_classes.mdx: -------------------------------------------------------------------------------- 1 | # Builder classes 2 | 3 | ## Builders 4 | 5 | 🤗 Datasets relies on two main classes during the dataset building process: [`DatasetBuilder`] and [`BuilderConfig`]. 
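For orientation, here is a minimal illustrative sketch of how the two classes typically fit together in a custom builder. The `MyConfig` and `MyDataset` names and the toy in-memory rows are assumptions made for this example, not part of the library:

```python
# Illustrative sketch only: `MyConfig`, `MyDataset`, and the inline rows are made up for this example.
import datasets


class MyConfig(datasets.BuilderConfig):
    """A BuilderConfig subclass can carry extra, dataset-specific parameters."""

    def __init__(self, separator: str = ",", **kwargs):
        super().__init__(**kwargs)
        self.separator = separator


class MyDataset(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIG_CLASS = MyConfig
    BUILDER_CONFIGS = [MyConfig(name="default", version=datasets.Version("1.0.0"))]

    def _info(self):
        # Declares the schema of the generated examples.
        return datasets.DatasetInfo(
            features=datasets.Features({"text": datasets.Value("string")})
        )

    def _split_generators(self, dl_manager):
        # A real builder would use dl_manager here to download/extract data files.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"rows": ["a,b", "c,d"]}
            )
        ]

    def _generate_examples(self, rows):
        for idx, row in enumerate(rows):
            yield idx, {"text": row.replace(self.config.separator, " ")}
```

The `[[autodoc]]` entries below document the full APIs; the sketch only shows the three methods a `GeneratorBasedBuilder` subclass is expected to implement.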
6 | 7 | [[autodoc]] datasets.DatasetBuilder 8 | 9 | [[autodoc]] datasets.GeneratorBasedBuilder 10 | 11 | [[autodoc]] datasets.ArrowBasedBuilder 12 | 13 | [[autodoc]] datasets.BuilderConfig 14 | 15 | ## Download 16 | 17 | [[autodoc]] datasets.DownloadManager 18 | 19 | [[autodoc]] datasets.StreamingDownloadManager 20 | 21 | [[autodoc]] datasets.DownloadConfig 22 | 23 | [[autodoc]] datasets.DownloadMode 24 | 25 | ## Verification 26 | 27 | [[autodoc]] datasets.VerificationMode 28 | 29 | ## Splits 30 | 31 | [[autodoc]] datasets.SplitGenerator 32 | 33 | [[autodoc]] datasets.Split 34 | 35 | [[autodoc]] datasets.NamedSplit 36 | 37 | [[autodoc]] datasets.NamedSplitAll 38 | 39 | [[autodoc]] datasets.ReadInstruction 40 | 41 | ## Version 42 | 43 | [[autodoc]] datasets.utils.Version 44 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/videofolder/videofolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class VideoFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """BuilderConfig for ImageFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | super().__post_init__() 17 | 18 | 19 | class VideoFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Video 21 | BASE_COLUMN_NAME = "video" 22 | BUILDER_CONFIG_CLASS = VideoFolderConfig 23 | EXTENSIONS: list[str] # definition at the bottom of the script 24 | 25 | 26 | # TODO: initial list, we should check the compatibility of other formats 27 | VIDEO_EXTENSIONS = [ 28 | ".mkv", 29 | ".mp4", 30 | ".avi", 31 | ".mpeg", 32 | ".mov", 33 | ] 34 | VideoFolder.EXTENSIONS = VIDEO_EXTENSIONS 35 | -------------------------------------------------------------------------------- /tests/test_info_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datasets.config 4 | from datasets.utils.info_utils import is_small_dataset 5 | 6 | 7 | @pytest.mark.parametrize("dataset_size", [None, 400 * 2**20, 600 * 2**20]) 8 | @pytest.mark.parametrize("input_in_memory_max_size", ["default", 0, 100 * 2**20, 900 * 2**20]) 9 | def test_is_small_dataset(dataset_size, input_in_memory_max_size, monkeypatch): 10 | if input_in_memory_max_size != "default": 11 | monkeypatch.setattr(datasets.config, "IN_MEMORY_MAX_SIZE", input_in_memory_max_size) 12 | in_memory_max_size = datasets.config.IN_MEMORY_MAX_SIZE 13 | if input_in_memory_max_size == "default": 14 | assert in_memory_max_size == 0 15 | else: 16 | assert in_memory_max_size == input_in_memory_max_size 17 | if dataset_size and in_memory_max_size: 18 | expected = dataset_size < in_memory_max_size 19 | else: 20 | expected = False 21 | result = is_small_dataset(dataset_size) 22 | assert result == expected 23 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 15 | 16 | Each major version is currently being supported with security updates. 
17 | 18 | | Version | Supported | 19 | |---------|--------------------| 20 | | 1.x.x | :white_check_mark: | 21 | | 2.x.x | :white_check_mark: | 22 | 23 | 24 | ## Reporting a Vulnerability 25 | 32 | 33 | To report a security vulnerability, please contact: security@huggingface.co 34 | -------------------------------------------------------------------------------- /.dvc/plots/smooth.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | "field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | }, 29 | "transform": [ 30 | { 31 | "loess": "", 32 | "on": "", 33 | "groupby": [ 34 | "rev" 35 | ], 36 | "bandwidth": 0.3 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pytest 4 | 5 | import datasets.utils.deprecation_utils 6 | from datasets.exceptions import ( 7 | ChecksumVerificationError, 8 | ExpectedMoreDownloadedFilesError, 9 | ExpectedMoreSplitsError, 10 | NonMatchingChecksumError, 11 | NonMatchingSplitsSizesError, 12 | SplitsVerificationError, 13 | UnexpectedDownloadedFileError, 14 | UnexpectedSplitsError, 15 | ) 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "error", 20 | [ 21 | ChecksumVerificationError, 22 | UnexpectedDownloadedFileError, 23 | ExpectedMoreDownloadedFilesError, 24 | NonMatchingChecksumError, 25 | SplitsVerificationError, 26 | UnexpectedSplitsError, 27 | ExpectedMoreSplitsError, 28 | NonMatchingSplitsSizesError, 29 | ], 30 | ) 31 | def test_error_not_deprecated(error, monkeypatch): 32 | monkeypatch.setattr(datasets.utils.deprecation_utils, "_emitted_deprecation_warnings", set()) 33 | with warnings.catch_warnings(): 34 | warnings.simplefilter("error") 35 | error() 36 | -------------------------------------------------------------------------------- /src/datasets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . 
import tqdm as _tqdm # _tqdm is the module 16 | from .experimental import experimental 17 | from .info_utils import VerificationMode 18 | from .logging import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled 19 | from .tqdm import ( 20 | are_progress_bars_disabled, 21 | disable_progress_bars, 22 | enable_progress_bars, 23 | tqdm, 24 | ) 25 | from .version import Version 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["enhancement"] 4 | body: 5 | - type: textarea 6 | id: feature-request 7 | attributes: 8 | label: Feature request 9 | description: A clear and concise description of the feature proposal. 10 | validations: 11 | required: true 12 | 13 | - type: textarea 14 | id: motivation 15 | validations: 16 | required: true 17 | attributes: 18 | label: Motivation 19 | description: | 20 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. 21 | 22 | - type: textarea 23 | id: contribution 24 | validations: 25 | required: true 26 | attributes: 27 | label: Your contribution 28 | description: | 29 | Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md). 30 | -------------------------------------------------------------------------------- /docs/source/tutorial.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Welcome to the 🤗 Datasets tutorials! These beginner-friendly tutorials will guide you through the fundamentals of working with 🤗 Datasets. You'll load and prepare a dataset for training with your machine learning framework of choice. Along the way, you'll learn how to load different dataset configurations and splits, interact with and see what's inside your dataset, preprocess, and share a dataset to the [Hub](https://huggingface.co/datasets). 4 | 5 | The tutorials assume some basic knowledge of Python and a machine learning framework like PyTorch or TensorFlow. If you're already familiar with these, feel free to check out the [quickstart](./quickstart) to see what you can do with 🤗 Datasets. 6 | 7 | > [!TIP] 8 | > The tutorials only cover the basic skills you need to use 🤗 Datasets. There are many other useful functionalities and applications that aren't discussed here. If you're interested in learning more, take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course. 9 | 10 | If you have any questions about 🤗 Datasets, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/datasets/10). 11 | 12 | Let's get started! 
🏁 13 | -------------------------------------------------------------------------------- /.github/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "datasets" %} 2 | 3 | package: 4 | name: "{{ name|lower }}" 5 | version: "{{ DATASETS_VERSION }}" 6 | 7 | source: 8 | path: ../../ 9 | 10 | build: 11 | noarch: python 12 | 13 | requirements: 14 | host: 15 | - python 16 | - pip 17 | - numpy >=1.17 18 | - pyarrow >=16.0.0 19 | - python-xxhash 20 | - dill 21 | - pandas 22 | - requests >=2.19.0 23 | - httpx <1.0.0 24 | - tqdm >=4.66.3 25 | - dataclasses 26 | - multiprocess 27 | - fsspec 28 | - huggingface_hub >=0.25.0,<2.0.0 29 | - packaging 30 | run: 31 | - python 32 | - pip 33 | - numpy >=1.17 34 | - pyarrow >=16.0.0 35 | - python-xxhash 36 | - dill 37 | - pandas 38 | - requests >=2.19.0 39 | - httpx <1.0.0 40 | - tqdm >=4.66.3 41 | - dataclasses 42 | - multiprocess 43 | - fsspec 44 | - huggingface_hub >=0.25.0,<2.0.0 45 | - packaging 46 | 47 | test: 48 | imports: 49 | - datasets 50 | 51 | about: 52 | home: https://huggingface.co 53 | license: Apache License 2.0 54 | license_file: LICENSE 55 | summary: "🤗 The largest hub of ready-to-use NLP datasets for ML models with fast, easy-to-use and efficient data manipulation tools" 56 | -------------------------------------------------------------------------------- /.github/workflows/release-conda.yml: -------------------------------------------------------------------------------- 1 | name: Release - Conda 2 | 3 | on: 4 | push: 5 | tags: 6 | - "[0-9]+.[0-9]+.[0-9]+*" 7 | 8 | env: 9 | ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} 10 | 11 | jobs: 12 | build_and_package: 13 | runs-on: ubuntu-22.04 14 | defaults: 15 | run: 16 | shell: bash -l {0} 17 | 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v4 21 | 22 | - name: Install miniconda 23 | uses: conda-incubator/setup-miniconda@v2 24 | with: 25 | auto-update-conda: true 26 | auto-activate-base: false 27 | activate-environment: "build-datasets" 28 | python-version: 3.9 29 | channels: huggingface 30 | 31 | - name: Setup conda env 32 | run: | 33 | conda install -c defaults anaconda-client conda-build 34 | 35 | - name: Extract version 36 | run: echo "DATASETS_VERSION=`python setup.py --version`" >> $GITHUB_ENV 37 | 38 | - name: Build conda packages 39 | run: | 40 | conda info 41 | conda build .github/conda 42 | 43 | - name: Upload to Anaconda 44 | run: | 45 | anaconda upload `conda build .github/conda --output -c conda-forge` --force 46 | -------------------------------------------------------------------------------- /src/datasets/utils/experimental.py: -------------------------------------------------------------------------------- 1 | """Contains utilities to flag a feature as "experimental" in datasets.""" 2 | 3 | import warnings 4 | from functools import wraps 5 | from typing import Callable 6 | 7 | 8 | def experimental(fn: Callable) -> Callable: 9 | """Decorator to flag a feature as experimental. 10 | 11 | An experimental feature trigger a warning when used as it might be subject to breaking changes in the future. 12 | 13 | Args: 14 | fn (`Callable`): 15 | The function to flag as experimental. 16 | 17 | Returns: 18 | `Callable`: The decorated function. 19 | 20 | Example: 21 | 22 | ```python 23 | >>> from datasets.utils import experimental 24 | 25 | >>> @experimental 26 | ... def my_function(): 27 | ... 
print("Hello world!") 28 | 29 | >>> my_function() 30 | UserWarning: 'my_function' is experimental and might be subject to breaking changes in the future. 31 | Hello world! 32 | ``` 33 | """ 34 | 35 | @wraps(fn) 36 | def _inner_fn(*args, **kwargs): 37 | warnings.warn( 38 | (f"'{fn.__name__}' is experimental and might be subject to breaking changes in the future."), 39 | UserWarning, 40 | ) 41 | return fn(*args, **kwargs) 42 | 43 | return _inner_fn 44 | -------------------------------------------------------------------------------- /src/datasets/commands/datasets_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from datasets.commands.delete_from_hub import DeleteFromHubCommand 5 | from datasets.commands.env import EnvironmentCommand 6 | from datasets.commands.test import TestCommand 7 | from datasets.utils.logging import set_verbosity_info 8 | 9 | 10 | def parse_unknown_args(unknown_args): 11 | return {key.lstrip("-"): value for key, value in zip(unknown_args[::2], unknown_args[1::2])} 12 | 13 | 14 | def main(): 15 | parser = ArgumentParser( 16 | "HuggingFace Datasets CLI tool", usage="datasets-cli []", allow_abbrev=False 17 | ) 18 | commands_parser = parser.add_subparsers(help="datasets-cli command helpers") 19 | set_verbosity_info() 20 | 21 | # Register commands 22 | EnvironmentCommand.register_subcommand(commands_parser) 23 | TestCommand.register_subcommand(commands_parser) 24 | DeleteFromHubCommand.register_subcommand(commands_parser) 25 | 26 | # Parse args 27 | args, unknown_args = parser.parse_known_args() 28 | if not hasattr(args, "func"): 29 | parser.print_help() 30 | exit(1) 31 | kwargs = parse_unknown_args(unknown_args) 32 | 33 | # Run 34 | service = args.func(args, **kwargs) 35 | service.run() 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /src/datasets/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | import fsspec 5 | import huggingface_hub 6 | import pandas 7 | import pyarrow 8 | 9 | from datasets import __version__ as version 10 | from datasets.commands import BaseDatasetsCLICommand 11 | 12 | 13 | def info_command_factory(_): 14 | return EnvironmentCommand() 15 | 16 | 17 | class EnvironmentCommand(BaseDatasetsCLICommand): 18 | @staticmethod 19 | def register_subcommand(parser: ArgumentParser): 20 | download_parser = parser.add_parser("env", help="Print relevant system environment info.") 21 | download_parser.set_defaults(func=info_command_factory) 22 | 23 | def run(self): 24 | info = { 25 | "`datasets` version": version, 26 | "Platform": platform.platform(), 27 | "Python version": platform.python_version(), 28 | "`huggingface_hub` version": huggingface_hub.__version__, 29 | "PyArrow version": pyarrow.__version__, 30 | "Pandas version": pandas.__version__, 31 | "`fsspec` version": fsspec.__version__, 32 | } 33 | 34 | print("\nCopy-and-paste the text below in your GitHub issue.\n") 35 | print(self.format_dict(info)) 36 | 37 | return info 38 | 39 | @staticmethod 40 | def format_dict(d): 41 | return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n" 42 | -------------------------------------------------------------------------------- /benchmarks/results/benchmark_array_xd.json: 
-------------------------------------------------------------------------------- 1 | {"write_array2d": 0.14168284999323077, "read_unformated after write_array2d": 0.04353281999647152, "read_formatted_as_numpy after write_array2d": 0.1285462469968479, "read_batch_unformated after write_array2d": 0.023109222995117307, "read_batch_formatted_as_numpy after write_array2d": 0.011352884990628809, "read_col_unformated after write_array2d": 0.037052362007671036, "read_col_formatted_as_numpy after write_array2d": 0.007985618998645805, "write_nested_sequence": 1.4927163410029607, "read_unformated after write_nested_sequence": 0.28319963401008863, "read_formatted_as_numpy after write_nested_sequence": 0.419271487990045, "read_batch_unformated after write_nested_sequence": 0.3234798710036557, "read_batch_formatted_as_numpy after write_nested_sequence": 0.03850809299910907, "read_col_unformated after write_nested_sequence": 0.29384092400141526, "read_col_formatted_as_numpy after write_nested_sequence": 0.004250421989127062, "write_flattened_sequence": 1.4521546780015342, "read_unformated after write_flattened_sequence": 0.25513897799828555, "read_formatted_as_numpy after write_flattened_sequence": 0.07564631900459062, "read_batch_unformated after write_flattened_sequence": 0.2758980469952803, "read_batch_formatted_as_numpy after write_flattened_sequence": 0.011008214991306886, "read_col_unformated after write_flattened_sequence": 0.25848906899045687, "read_col_formatted_as_numpy after write_flattened_sequence": 0.004328447001171298} -------------------------------------------------------------------------------- /src/datasets/packaged_modules/generator/generator.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Callable, Optional 3 | 4 | import datasets 5 | from datasets.builder import Key 6 | from datasets.utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs 7 | 8 | 9 | @dataclass 10 | class GeneratorConfig(datasets.BuilderConfig): 11 | generator: Optional[Callable] = None 12 | gen_kwargs: Optional[dict] = None 13 | features: Optional[datasets.Features] = None 14 | split: datasets.NamedSplit = datasets.Split.TRAIN 15 | 16 | def __post_init__(self): 17 | super().__post_init__() 18 | if self.generator is None: 19 | raise ValueError("generator must be specified") 20 | 21 | if self.gen_kwargs is None: 22 | self.gen_kwargs = {} 23 | 24 | 25 | class Generator(datasets.GeneratorBasedBuilder): 26 | BUILDER_CONFIG_CLASS = GeneratorConfig 27 | 28 | def _info(self): 29 | return datasets.DatasetInfo(features=self.config.features) 30 | 31 | def _split_generators(self, dl_manager): 32 | return [datasets.SplitGenerator(name=self.config.split, gen_kwargs=self.config.gen_kwargs)] 33 | 34 | def _generate_examples(self, **gen_kwargs): 35 | num_shards = _number_of_shards_in_gen_kwargs(gen_kwargs) 36 | for shard_idx, shard_gen_kwargs in enumerate(_split_gen_kwargs(gen_kwargs, max_num_jobs=num_shards)): 37 | for sample_idx, sample in enumerate(self.config.generator(**shard_gen_kwargs)): 38 | yield Key(shard_idx, sample_idx), sample 39 | -------------------------------------------------------------------------------- /src/datasets/commands/delete_from_hub.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from typing import Optional 3 | 4 | from datasets.commands import BaseDatasetsCLICommand 5 | from datasets.hub import 
delete_from_hub 6 | 7 | 8 | def _command_factory(args): 9 | return DeleteFromHubCommand( 10 | args.dataset_id, 11 | args.config_name, 12 | args.token, 13 | args.revision, 14 | ) 15 | 16 | 17 | class DeleteFromHubCommand(BaseDatasetsCLICommand): 18 | @staticmethod 19 | def register_subcommand(parser): 20 | parser: ArgumentParser = parser.add_parser("delete_from_hub", help="Delete dataset config from the Hub") 21 | parser.add_argument( 22 | "dataset_id", help="source dataset ID, e.g. USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME" 23 | ) 24 | parser.add_argument("config_name", help="config name to delete") 25 | parser.add_argument("--token", help="access token to the Hugging Face Hub") 26 | parser.add_argument("--revision", help="source revision") 27 | parser.set_defaults(func=_command_factory) 28 | 29 | def __init__( 30 | self, 31 | dataset_id: str, 32 | config_name: str, 33 | token: Optional[str], 34 | revision: Optional[str], 35 | ): 36 | self._dataset_id = dataset_id 37 | self._config_name = config_name 38 | self._token = token 39 | self._revision = revision 40 | 41 | def run(self) -> None: 42 | _ = delete_from_hub(self._dataset_id, self._config_name, revision=self._revision, token=self._token) 43 | -------------------------------------------------------------------------------- /docs/source/cli.mdx: -------------------------------------------------------------------------------- 1 | # Command Line Interface (CLI) 2 | 3 | 🤗 Datasets provides a command line interface (CLI) with useful shell commands to interact with your dataset. 4 | 5 | You can check the available commands: 6 | ```bash 7 | >>> datasets-cli --help 8 | usage: datasets-cli [] 9 | 10 | positional arguments: 11 | {env,test,delete_from_hub} 12 | datasets-cli command helpers 13 | env Print relevant system environment info. 14 | test Test dataset loading. 15 | delete_from_hub Delete dataset config from the Hub 16 | 17 | optional arguments: 18 | -h, --help show this help message and exit 19 | ``` 20 | 21 | ## Delete from Hub 22 | 23 | Delete a dataset configuration from a [supported dataset](repository_structure) on the Hub. 24 | 25 | ```bash 26 | >>> datasets-cli delete_from_hub --help 27 | usage: datasets-cli [] delete_from_hub [-h] [--token TOKEN] [--revision REVISION] dataset_id config_name 28 | 29 | positional arguments: 30 | dataset_id source dataset ID, e.g. 
USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME 31 | config_name config name to delete 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | --token TOKEN access token to the Hugging Face Hub 36 | --revision REVISION source revision 37 | ``` 38 | 39 | For example: 40 | ```bash 41 | >>> datasets-cli delete_from_hub USERNAME/DATASET_NAME CONFIG_NAME 42 | ``` 43 | 44 | > [!TIP] 45 | > Do not forget that you need to log in first to your Hugging Face account: 46 | > ```bash 47 | > >>> hf auth login 48 | > ``` 49 | -------------------------------------------------------------------------------- /src/datasets/distributed.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | from .arrow_dataset import Dataset, _split_by_node_map_style_dataset 4 | from .iterable_dataset import IterableDataset, _split_by_node_iterable_dataset 5 | 6 | 7 | DatasetType = TypeVar("DatasetType", Dataset, IterableDataset) 8 | 9 | 10 | def split_dataset_by_node(dataset: DatasetType, rank: int, world_size: int) -> DatasetType: 11 | """ 12 | Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`. 13 | 14 | For map-style datasets: 15 | 16 | Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset. 17 | To maximize data loading throughput, chunks are made of contiguous data on disk if possible. 18 | 19 | For iterable datasets: 20 | 21 | If the dataset has a number of shards that is a factor of `world_size` (i.e. if `dataset.num_shards % world_size == 0`), 22 | then the shards are evenly assigned across the nodes, which is the most optimized. 23 | Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples. 24 | 25 | Args: 26 | dataset ([`Dataset`] or [`IterableDataset`]): 27 | The dataset to split by node. 28 | rank (`int`): 29 | Rank of the current node. 30 | world_size (`int`): 31 | Total number of nodes. 32 | 33 | Returns: 34 | [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`. 35 | """ 36 | if isinstance(dataset, Dataset): 37 | return _split_by_node_map_style_dataset(dataset, rank=rank, world_size=world_size) 38 | else: 39 | return _split_by_node_iterable_dataset(dataset, rank=rank, world_size=world_size) 40 | -------------------------------------------------------------------------------- /src/datasets/filesystems/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import shutil 3 | import warnings 4 | from typing import List 5 | 6 | import fsspec 7 | import fsspec.asyn 8 | from fsspec.implementations.local import LocalFileSystem 9 | 10 | from . import compression 11 | 12 | 13 | COMPRESSION_FILESYSTEMS: list[compression.BaseCompressedFileFileSystem] = [ 14 | compression.Bz2FileSystem, 15 | compression.GzipFileSystem, 16 | compression.Lz4FileSystem, 17 | compression.XzFileSystem, 18 | compression.ZstdFileSystem, 19 | ] 20 | 21 | # Register custom filesystems 22 | for fs_class in COMPRESSION_FILESYSTEMS: 23 | if fs_class.protocol in fsspec.registry and fsspec.registry[fs_class.protocol] is not fs_class: 24 | warnings.warn(f"A filesystem protocol was already set for {fs_class.protocol} and will be overwritten.") 25 | fsspec.register_implementation(fs_class.protocol, fs_class, clobber=True) 26 | 27 | 28 | def is_remote_filesystem(fs: fsspec.AbstractFileSystem) -> bool: 29 | """ 30 | Checks if `fs` is a remote filesystem. 
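A filesystem is considered remote as soon as it is not an instance of `fsspec`'s `LocalFileSystem`.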
31 | 32 | Args: 33 | fs (`fsspec.spec.AbstractFileSystem`): 34 | An abstract super-class for pythonic file-systems, e.g. `fsspec.filesystem(\'file\')` or `s3fs.S3FileSystem`. 35 | """ 36 | return not isinstance(fs, LocalFileSystem) 37 | 38 | 39 | def rename(fs: fsspec.AbstractFileSystem, src: str, dst: str): 40 | """ 41 | Renames the file `src` in `fs` to `dst`. 42 | """ 43 | if not is_remote_filesystem(fs): 44 | # LocalFileSystem.mv does copy + rm, it is more efficient to simply move a local directory 45 | shutil.move(fs._strip_protocol(src), fs._strip_protocol(dst)) 46 | else: 47 | fs.mv(src, dst, recursive=True) 48 | -------------------------------------------------------------------------------- /docs/source/how_to.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | The how-to guides offer a more comprehensive overview of all the tools 🤗 Datasets offers and how to use them. This will help you tackle messier real-world datasets where you may need to manipulate the dataset structure or content to get it ready for training. 4 | 5 | The guides assume you are familiar and comfortable with the 🤗 Datasets basics. We recommend newer users check out our [tutorials](tutorial) first. 6 | 7 | > [!TIP] 8 | > Interested in learning more? Take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course! 9 | 10 | The guides are organized into six sections: 11 | 12 | - General usage: Functions for general dataset loading and processing. The functions shown in this section are applicable across all dataset modalities. 13 | - Audio: How to load, process, and share audio datasets. 14 | - Vision: How to load, process, and share image and video datasets. 15 | - Text: How to load, process, and share text datasets. 16 | - Tabular: How to load, process, and share tabular datasets. 17 | - Dataset repository: How to share and upload a dataset to the Hub. 18 | 19 | If you have any questions about 🤗 Datasets, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/datasets/10). 20 | -------------------------------------------------------------------------------- /benchmarks/format.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | 5 | def format_json_to_md(input_json_file, output_md_file): 6 | with open(input_json_file, encoding="utf-8") as f: 7 | results = json.load(f) 8 | 9 | output_md = ["
", "Show updated benchmarks!", " "] 10 | 11 | for benchmark_name in sorted(results): 12 | benchmark_res = results[benchmark_name] 13 | 14 | benchmark_file_name = benchmark_name.split("/")[-1] 15 | output_md.append(f"### Benchmark: {benchmark_file_name}") 16 | 17 | title = "| metric |" 18 | lines = "|--------|" 19 | value = "| new / old (diff) |" 20 | for metric_name in sorted(benchmark_res): 21 | metric_vals = benchmark_res[metric_name] 22 | new_val = metric_vals["new"] 23 | old_val = metric_vals.get("old", None) 24 | dif_val = metric_vals.get("diff", None) 25 | 26 | val_str = f" {new_val:f}" if isinstance(new_val, (int, float)) else "None" 27 | 28 | if old_val is not None: 29 | val_str += f" / {old_val:f}" if isinstance(old_val, (int, float)) else "None" 30 | if dif_val is not None: 31 | val_str += f" ({dif_val:f})" if isinstance(dif_val, (int, float)) else "None" 32 | 33 | title += " " + metric_name + " |" 34 | lines += "---|" 35 | value += val_str + " |" 36 | 37 | output_md += [title, lines, value, " "] 38 | 39 | output_md.append("
") 40 | 41 | with open(output_md_file, "w", encoding="utf-8") as f: 42 | f.writelines("\n".join(output_md)) 43 | 44 | 45 | if __name__ == "__main__": 46 | input_json_file = sys.argv[1] 47 | output_md_file = sys.argv[2] 48 | 49 | format_json_to_md(input_json_file, output_md_file) 50 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Create a report to help reproduce and fix the bug 3 | body: 4 | - type: textarea 5 | id: description 6 | attributes: 7 | label: Describe the bug 8 | description: A clear and concise description of what the bug is 9 | validations: 10 | required: true 11 | 12 | - type: textarea 13 | id: reproduction 14 | attributes: 15 | label: Steps to reproduce the bug 16 | description: | 17 | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. 18 | If you have code snippets, error messages, stack traces please provide them here as well. 19 | Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 20 | Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. 21 | placeholder: | 22 | Steps to reproduce the behavior: 23 | 24 | 1. 25 | 2. 26 | 3. 27 | validations: 28 | required: true 29 | 30 | - type: textarea 31 | id: expected-behavior 32 | validations: 33 | required: true 34 | attributes: 35 | label: Expected behavior 36 | description: A clear and concise description of the expected results. 37 | 38 | - type: textarea 39 | id: environment-info 40 | attributes: 41 | label: Environment info 42 | description: Please share your environemnt info with us. You can run the command `datasets-cli env` and copy-paste its output below. 43 | placeholder: datasets version, platform, python version, ... 44 | validations: 45 | required: true 46 | -------------------------------------------------------------------------------- /src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | __version__ = "4.4.2.dev0" 16 | 17 | from .arrow_dataset import Column, Dataset 18 | from .arrow_reader import ReadInstruction 19 | from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder 20 | from .combine import concatenate_datasets, interleave_datasets 21 | from .dataset_dict import DatasetDict, IterableDatasetDict 22 | from .download import * 23 | from .features import * 24 | from .fingerprint import disable_caching, enable_caching, is_caching_enabled 25 | from .info import DatasetInfo 26 | from .inspect import ( 27 | get_dataset_config_info, 28 | get_dataset_config_names, 29 | get_dataset_default_config_name, 30 | get_dataset_infos, 31 | get_dataset_split_names, 32 | ) 33 | from .iterable_dataset import IterableColumn, IterableDataset 34 | from .load import load_dataset, load_dataset_builder, load_from_disk 35 | from .splits import ( 36 | NamedSplit, 37 | NamedSplitAll, 38 | Split, 39 | SplitBase, 40 | SplitDict, 41 | SplitGenerator, 42 | SplitInfo, 43 | SubSplitInfo, 44 | percent, 45 | ) 46 | from .utils import * 47 | from .utils import logging 48 | -------------------------------------------------------------------------------- /tests/distributed_scripts/run_torch_distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | from typing import List 4 | 5 | import torch.utils.data 6 | 7 | from datasets import Dataset, IterableDataset 8 | from datasets.distributed import split_dataset_by_node 9 | 10 | 11 | NUM_SHARDS = 4 12 | NUM_ITEMS_PER_SHARD = 3 13 | 14 | 15 | class FailedTestError(RuntimeError): 16 | pass 17 | 18 | 19 | def gen(shards: List[str]): 20 | for shard in shards: 21 | for i in range(NUM_ITEMS_PER_SHARD): 22 | yield {"i": i, "shard": shard} 23 | 24 | 25 | def main(): 26 | rank = int(os.environ["RANK"]) 27 | world_size = int(os.environ["WORLD_SIZE"]) 28 | 29 | parser = ArgumentParser() 30 | parser.add_argument("--streaming", type=bool) 31 | parser.add_argument("--local_rank", type=int) 32 | parser.add_argument("--num_workers", type=int, default=0) 33 | args = parser.parse_args() 34 | streaming = args.streaming 35 | num_workers = args.num_workers 36 | 37 | gen_kwargs = {"shards": [f"shard_{shard_idx}" for shard_idx in range(NUM_SHARDS)]} 38 | ds = IterableDataset.from_generator(gen, gen_kwargs=gen_kwargs) 39 | if not streaming: 40 | ds = Dataset.from_list(list(ds)) 41 | 42 | ds = split_dataset_by_node(ds, rank=rank, world_size=world_size) 43 | dataloader = torch.utils.data.DataLoader(ds, num_workers=num_workers) 44 | 45 | full_size = NUM_SHARDS * NUM_ITEMS_PER_SHARD 46 | expected_local_size = full_size // world_size 47 | expected_local_size += int(rank < (full_size % world_size)) 48 | 49 | local_size = sum(1 for _ in dataloader) 50 | if local_size != expected_local_size: 51 | raise FailedTestError(f"local_size {local_size} != expected_local_size {expected_local_size}") 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /src/datasets/io/abc.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional, Union 3 | 4 | from .. 
import Dataset, DatasetDict, Features, IterableDataset, IterableDatasetDict, NamedSplit 5 | from ..utils.typing import NestedDataStructureLike, PathLike 6 | 7 | 8 | class AbstractDatasetReader(ABC): 9 | def __init__( 10 | self, 11 | path_or_paths: Optional[NestedDataStructureLike[PathLike]] = None, 12 | split: Optional[NamedSplit] = None, 13 | features: Optional[Features] = None, 14 | cache_dir: str = None, 15 | keep_in_memory: bool = False, 16 | streaming: bool = False, 17 | num_proc: Optional[int] = None, 18 | **kwargs, 19 | ): 20 | self.path_or_paths = path_or_paths 21 | self.split = split if split or isinstance(path_or_paths, dict) else "train" 22 | self.features = features 23 | self.cache_dir = cache_dir 24 | self.keep_in_memory = keep_in_memory 25 | self.streaming = streaming 26 | self.num_proc = num_proc 27 | self.kwargs = kwargs 28 | 29 | @abstractmethod 30 | def read(self) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: 31 | pass 32 | 33 | 34 | class AbstractDatasetInputStream(ABC): 35 | def __init__( 36 | self, 37 | features: Optional[Features] = None, 38 | cache_dir: str = None, 39 | keep_in_memory: bool = False, 40 | streaming: bool = False, 41 | num_proc: Optional[int] = None, 42 | **kwargs, 43 | ): 44 | self.features = features 45 | self.cache_dir = cache_dir 46 | self.keep_in_memory = keep_in_memory 47 | self.streaming = streaming 48 | self.num_proc = num_proc 49 | self.kwargs = kwargs 50 | 51 | @abstractmethod 52 | def read(self) -> Union[Dataset, IterableDataset]: 53 | pass 54 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # 🤗 Datasets Notebooks 18 | 19 | You can find here a list of the official notebooks provided by Hugging Face. 20 | 21 | Also, we would like to list here interesting content created by the community. 22 | If you wrote some notebook(s) leveraging 🤗 Datasets and would like it to be listed here, please open a 23 | Pull Request so it can be included under the Community notebooks. 
24 | 25 | ## Hugging Face's notebooks 🤗 26 | 27 | ### Documentation notebooks 28 | 29 | You can open any page of the documentation as a notebook in Colab (there is a button directly on said pages) but they are also listed here if you need them: 30 | 31 | | Notebook | Description | | | 32 | |:----------|:-------------|:-------------|------:| 33 | | [Quickstart](https://github.com/huggingface/notebooks/blob/main/datasets_doc/en/quickstart.ipynb) | A quick presentation on integrating Datasets into a model training workflow |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/datasets_doc/en/quickstart.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/datasets_doc/en/quickstart.ipynb)| 34 | -------------------------------------------------------------------------------- /benchmarks/benchmark_indices_mapping.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | 5 | import datasets 6 | from utils import generate_example_dataset, get_duration 7 | 8 | 9 | SPEED_TEST_N_EXAMPLES = 500_000 10 | 11 | RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__) 12 | RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json")) 13 | 14 | 15 | @get_duration 16 | def select(dataset: datasets.Dataset): 17 | _ = dataset.select(range(0, len(dataset), 2)) 18 | 19 | 20 | @get_duration 21 | def sort(dataset: datasets.Dataset): 22 | _ = dataset.sort("numbers") 23 | 24 | 25 | @get_duration 26 | def shuffle(dataset: datasets.Dataset): 27 | _ = dataset.shuffle() 28 | 29 | 30 | @get_duration 31 | def train_test_split(dataset: datasets.Dataset): 32 | _ = dataset.train_test_split(0.1) 33 | 34 | 35 | @get_duration 36 | def shard(dataset: datasets.Dataset, num_shards=10): 37 | for shard_id in range(num_shards): 38 | _ = dataset.shard(num_shards, shard_id) 39 | 40 | 41 | def benchmark_indices_mapping(): 42 | times = {"num examples": SPEED_TEST_N_EXAMPLES} 43 | functions = (select, sort, shuffle, train_test_split, shard) 44 | with tempfile.TemporaryDirectory() as tmp_dir: 45 | print("generating dataset") 46 | features = datasets.Features({"text": datasets.Value("string"), "numbers": datasets.Value("float32")}) 47 | dataset = generate_example_dataset( 48 | os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES 49 | ) 50 | print("Functions") 51 | for func in functions: 52 | print(func.__name__) 53 | times[func.__name__] = func(dataset) 54 | 55 | with open(RESULTS_FILE_PATH, "wb") as f: 56 | f.write(json.dumps(times).encode("utf-8")) 57 | 58 | 59 | if __name__ == "__main__": # useful to run the profiler 60 | benchmark_indices_mapping() 61 | -------------------------------------------------------------------------------- /tests/test_splits.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | import pytest 4 | 5 | from datasets.splits import Split, SplitDict, SplitInfo 6 | from datasets.utils.py_utils import asdict 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "split_dict", 11 | [ 12 | SplitDict(), 13 | SplitDict({"train": SplitInfo(name="train", num_bytes=1337, num_examples=42, dataset_name="my_dataset")}), 14 | SplitDict({"train": SplitInfo(name="train", num_bytes=1337, num_examples=42)}), 15 | SplitDict({"train": 
SplitInfo()}), 16 | ], 17 | ) 18 | def test_split_dict_to_yaml_list(split_dict: SplitDict): 19 | split_dict_yaml_list = split_dict._to_yaml_list() 20 | assert len(split_dict_yaml_list) == len(split_dict) 21 | reloaded = SplitDict._from_yaml_list(split_dict_yaml_list) 22 | for split_name, split_info in split_dict.items(): 23 | # dataset_name field is deprecated, and is therefore not part of the YAML dump 24 | split_info.dataset_name = None 25 | # the split name of split_dict takes over the name of the split info object 26 | split_info.name = split_name 27 | assert split_dict == reloaded 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "split_info", [SplitInfo(), SplitInfo(dataset_name=None), SplitInfo(dataset_name="my_dataset")] 32 | ) 33 | def test_split_dict_asdict_has_dataset_name(split_info): 34 | # For backward compatibility, we need asdict(split_dict) to return split info dictrionaries with the "dataset_name" 35 | # field even if it's deprecated. This way old versionso of `datasets` can still reload dataset_infos.json files 36 | split_dict_asdict = asdict(SplitDict({"train": split_info})) 37 | assert "dataset_name" in split_dict_asdict["train"] 38 | assert split_dict_asdict["train"]["dataset_name"] == split_info.dataset_name 39 | 40 | 41 | def test_named_split_inequality(): 42 | # Used while building the docs, when set as a default parameter value in a function signature 43 | assert Split.TRAIN != inspect.Parameter.empty 44 | -------------------------------------------------------------------------------- /tests/test_parallel.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.parallel import ParallelBackendConfig, parallel_backend 4 | from datasets.utils.py_utils import map_nested 5 | 6 | from .utils import require_dill_gt_0_3_2, require_joblibspark, require_not_windows 7 | 8 | 9 | def add_one(i): # picklable for multiprocessing 10 | return i + 1 11 | 12 | 13 | @require_dill_gt_0_3_2 14 | @require_joblibspark 15 | @require_not_windows 16 | def test_parallel_backend_input(): 17 | with parallel_backend("spark"): 18 | assert ParallelBackendConfig.backend_name == "spark" 19 | 20 | lst = [1, 2, 3] 21 | with pytest.raises(ValueError): 22 | with parallel_backend("unsupported backend"): 23 | map_nested(add_one, lst, num_proc=2) 24 | 25 | with pytest.raises(ValueError): 26 | with parallel_backend("unsupported backend"): 27 | map_nested(add_one, lst, num_proc=-1) 28 | 29 | 30 | @require_dill_gt_0_3_2 31 | @require_joblibspark 32 | @require_not_windows 33 | @pytest.mark.parametrize("num_proc", [2, -1]) 34 | def test_parallel_backend_map_nested(num_proc): 35 | s1 = [1, 2] 36 | s2 = {"a": 1, "b": 2} 37 | s3 = {"a": [1, 2], "b": [3, 4]} 38 | s4 = {"a": {"1": 1}, "b": 2} 39 | s5 = {"a": 1, "b": 2, "c": 3, "d": 4} 40 | expected_map_nested_s1 = [2, 3] 41 | expected_map_nested_s2 = {"a": 2, "b": 3} 42 | expected_map_nested_s3 = {"a": [2, 3], "b": [4, 5]} 43 | expected_map_nested_s4 = {"a": {"1": 2}, "b": 3} 44 | expected_map_nested_s5 = {"a": 2, "b": 3, "c": 4, "d": 5} 45 | 46 | with parallel_backend("spark"): 47 | assert map_nested(add_one, s1, num_proc=num_proc) == expected_map_nested_s1 48 | assert map_nested(add_one, s2, num_proc=num_proc) == expected_map_nested_s2 49 | assert map_nested(add_one, s3, num_proc=num_proc) == expected_map_nested_s3 50 | assert map_nested(add_one, s4, num_proc=num_proc) == expected_map_nested_s4 51 | assert map_nested(add_one, s5, num_proc=num_proc) == expected_map_nested_s5 52 | 
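# The tests above exercise the public pattern end to end; as a rough standalone sketch (assuming
# pyspark and joblibspark are installed), the same API can be used outside pytest like this:
#
#     from datasets.parallel import parallel_backend
#     from datasets.utils.py_utils import map_nested
#
#     def add_one(i):
#         return i + 1
#
#     with parallel_backend("spark"):
#         # num_proc > 1 makes map_nested dispatch its jobs through joblib's Spark backend
#         print(map_nested(add_one, {"a": [1, 2], "b": [3, 4]}, num_proc=2))  # {'a': [2, 3], 'b': [4, 5]}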
-------------------------------------------------------------------------------- /docs/source/nlp_load.mdx: -------------------------------------------------------------------------------- 1 | # Load text data 2 | 3 | This guide shows you how to load text datasets. To learn how to load any type of dataset, take a look at the general loading guide. 4 | 5 | Text files are one of the most common file types for storing a dataset. By default, 🤗 Datasets samples a text file line by line to build the dataset. 6 | 7 | ```py 8 | >>> from datasets import load_dataset 9 | >>> dataset = load_dataset("text", data_files={"train": ["my_text_1.txt", "my_text_2.txt"], "test": "my_test_file.txt"}) 10 | 11 | # Load from a directory 12 | >>> dataset = load_dataset("text", data_dir="path/to/text/dataset") 13 | ``` 14 | 15 | To sample a text file by paragraph or even an entire document, use the `sample_by` parameter: 16 | 17 | ```py 18 | # Sample by paragraph 19 | >>> dataset = load_dataset("text", data_files={"train": "my_train_file.txt", "test": "my_test_file.txt"}, sample_by="paragraph") 20 | 21 | # Sample by document 22 | >>> dataset = load_dataset("text", data_files={"train": "my_train_file.txt", "test": "my_test_file.txt"}, sample_by="document") 23 | ``` 24 | 25 | You can also use grep patterns to load specific files: 26 | 27 | ```py 28 | >>> from datasets import load_dataset 29 | >>> c4_subset = load_dataset("allenai/c4", data_files="en/c4-train.0000*-of-01024.json.gz") 30 | ``` 31 | 32 | To load remote text files via HTTP, pass the URLs instead: 33 | 34 | ```py 35 | >>> dataset = load_dataset("text", data_files="https://huggingface.co/datasets/hf-internal-testing/dataset_with_data_files/resolve/main/data/train.txt") 36 | ``` 37 | 38 | To load XML data you can use the "xml" loader, which is equivalent to "text" with sample_by="document": 39 | 40 | ```py 41 | >>> from datasets import load_dataset 42 | >>> dataset = load_dataset("xml", data_files={"train": ["my_xml_1.xml", "my_xml_2.xml"], "test": "my_xml_file.xml"}) 43 | 44 | # Load from a directory 45 | >>> dataset = load_dataset("xml", data_dir="path/to/xml/dataset") 46 | ``` 47 | -------------------------------------------------------------------------------- /src/datasets/io/spark.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import pyspark 4 | 5 | from .. import Features, NamedSplit 6 | from ..download import DownloadMode 7 | from ..packaged_modules.spark.spark import Spark 8 | from .abc import AbstractDatasetReader 9 | 10 | 11 | class SparkDatasetReader(AbstractDatasetReader): 12 | """A dataset reader that reads from a Spark DataFrame. 13 | 14 | When caching, cache materialization is parallelized over Spark; an NFS that is accessible to the driver must be 15 | provided. Streaming is not currently supported. 
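Example (a hypothetical usage sketch, not taken from the original docstring; it assumes an active `SparkSession`, a `pyspark.sql.DataFrame` named `df`, and — for the cached path — a cache directory on shared storage as described above):

    ```python
    >>> from datasets.io.spark import SparkDatasetReader
    >>> ds = SparkDatasetReader(df, streaming=False, cache_dir="/mnt/shared_nfs/datasets_cache").read()
    ```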
16 | """ 17 | 18 | def __init__( 19 | self, 20 | df: pyspark.sql.DataFrame, 21 | split: Optional[NamedSplit] = None, 22 | features: Optional[Features] = None, 23 | streaming: bool = True, 24 | cache_dir: str = None, 25 | keep_in_memory: bool = False, 26 | working_dir: str = None, 27 | load_from_cache_file: bool = True, 28 | file_format: str = "arrow", 29 | **kwargs, 30 | ): 31 | super().__init__( 32 | split=split, 33 | features=features, 34 | cache_dir=cache_dir, 35 | keep_in_memory=keep_in_memory, 36 | streaming=streaming, 37 | **kwargs, 38 | ) 39 | self._load_from_cache_file = load_from_cache_file 40 | self._file_format = file_format 41 | self.builder = Spark( 42 | df=df, 43 | features=features, 44 | cache_dir=cache_dir, 45 | working_dir=working_dir, 46 | **kwargs, 47 | ) 48 | 49 | def read(self): 50 | if self.streaming: 51 | return self.builder.as_streaming_dataset(split=self.split) 52 | download_mode = None if self._load_from_cache_file else DownloadMode.FORCE_REDOWNLOAD 53 | self.builder.download_and_prepare( 54 | download_mode=download_mode, 55 | file_format=self._file_format, 56 | ) 57 | return self.builder.as_dataset(split=self.split) 58 | -------------------------------------------------------------------------------- /tests/test_dataset_list.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from datasets import List, Value 4 | from datasets.arrow_dataset import Dataset 5 | 6 | 7 | class DatasetListTest(TestCase): 8 | def _create_example_records(self): 9 | return [ 10 | {"col_1": 3, "col_2": "a"}, 11 | {"col_1": 2, "col_2": "b"}, 12 | {"col_1": 1, "col_2": "c"}, 13 | {"col_1": 0, "col_2": "d"}, 14 | ] 15 | 16 | def _create_example_dict(self): 17 | data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]} 18 | return Dataset.from_dict(data) 19 | 20 | def test_create(self): 21 | example_records = self._create_example_records() 22 | dset = Dataset.from_list(example_records) 23 | self.assertListEqual(dset.column_names, ["col_1", "col_2"]) 24 | for i, r in enumerate(dset): 25 | self.assertDictEqual(r, example_records[i]) 26 | 27 | def test_list_dict_equivalent(self): 28 | example_records = self._create_example_records() 29 | dset = Dataset.from_list(example_records) 30 | dset_from_dict = Dataset.from_dict({k: [r[k] for r in example_records] for k in example_records[0]}) 31 | self.assertEqual(dset.info, dset_from_dict.info) 32 | 33 | def test_uneven_records(self): # checks what happens with missing columns 34 | uneven_records = [{"col_1": 1}, {"col_2": "x"}] 35 | dset = Dataset.from_list(uneven_records) 36 | self.assertDictEqual(dset[0], {"col_1": 1}) 37 | self.assertDictEqual(dset[1], {"col_1": None}) # NB: first record is used for columns 38 | 39 | def test_variable_list_records(self): # checks if the type can be inferred from the second record 40 | list_records = [{"col_1": []}, {"col_1": [1, 2]}] 41 | dset = Dataset.from_list(list_records) 42 | self.assertEqual(dset.info.features["col_1"], List(Value("int64"))) 43 | 44 | def test_create_empty(self): 45 | dset = Dataset.from_list([]) 46 | self.assertEqual(len(dset), 0) 47 | self.assertListEqual(dset.column_names, []) 48 | -------------------------------------------------------------------------------- /src/datasets/utils/track.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable, Iterator 2 | 3 | 4 | class tracked_str(str): 5 | origins = {} 6 | 7 | def set_origin(self, origin: str): 
8 | if super().__repr__() not in self.origins: 9 | self.origins[super().__repr__()] = origin 10 | 11 | def get_origin(self): 12 | return self.origins.get(super().__repr__(), str(self)) 13 | 14 | def __repr__(self) -> str: 15 | if super().__repr__() not in self.origins or self.origins[super().__repr__()] == self: 16 | return super().__repr__() 17 | else: 18 | return f"{str(self)} (origin={self.origins[super().__repr__()]})" 19 | 20 | 21 | class tracked_list(list): 22 | def __init__(self, *args, **kwargs) -> None: 23 | super().__init__(*args, **kwargs) 24 | self.last_item = None 25 | 26 | def __iter__(self) -> Iterator: 27 | for x in super().__iter__(): 28 | self.last_item = x 29 | yield x 30 | self.last_item = None 31 | 32 | def __repr__(self) -> str: 33 | if self.last_item is None: 34 | return super().__repr__() 35 | else: 36 | return f"{self.__class__.__name__}(current={self.last_item})" 37 | 38 | 39 | class TrackedIterableFromGenerator(Iterable): 40 | """Utility class to create an iterable from a generator function, in order to reset the generator when needed.""" 41 | 42 | def __init__(self, generator, *args): 43 | super().__init__() 44 | self.generator = generator 45 | self.args = args 46 | self.last_item = None 47 | 48 | def __iter__(self): 49 | for x in self.generator(*self.args): 50 | self.last_item = x 51 | yield x 52 | self.last_item = None 53 | 54 | def __repr__(self) -> str: 55 | if self.last_item is None: 56 | return super().__repr__() 57 | else: 58 | return f"{self.__class__.__name__}(current={self.last_item})" 59 | 60 | def __reduce__(self): 61 | return (self.__class__, (self.generator, *self.args)) 62 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/audiofolder/audiofolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class AudioFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """Builder Config for AudioFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | super().__post_init__() 17 | 18 | 19 | class AudioFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Audio 21 | BASE_COLUMN_NAME = "audio" 22 | BUILDER_CONFIG_CLASS = AudioFolderConfig 23 | EXTENSIONS: list[str] # definition at the bottom of the script 24 | 25 | 26 | # Obtained with: 27 | # ``` 28 | # import soundfile as sf 29 | # 30 | # AUDIO_EXTENSIONS = [f".{format.lower()}" for format in sf.available_formats().keys()] 31 | # 32 | # # .opus decoding is supported if libsndfile >= 1.0.31: 33 | # AUDIO_EXTENSIONS.extend([".opus"]) 34 | # ``` 35 | # We intentionally did not run this code on launch because: 36 | # (1) Soundfile was an optional dependency, so importing it in global namespace is not allowed 37 | # (2) To ensure the list of supported extensions is deterministic 38 | # (3) We use TorchCodec now anyways instead of Soundfile 39 | AUDIO_EXTENSIONS = [ 40 | ".aiff", 41 | ".au", 42 | ".avr", 43 | ".caf", 44 | ".flac", 45 | ".htk", 46 | ".svx", 47 | ".mat4", 48 | ".mat5", 49 | ".mpc2k", 50 | ".ogg", 51 | ".paf", 52 | ".pvf", 53 | ".raw", 54 | ".rf64", 55 | ".sd2", 56 | ".sds", 57 | ".ircam", 58 | ".voc", 59 | ".w64", 60 | ".wav", 61 | ".nist", 62 | ".wavex", 63 | ".wve", 64 | ".xi", 65 | ".mp3", 66 | ".opus", 67 | ".3gp", 68 | ".3g2", 69 | ".avi", 70 | 
".asf", 71 | ".flv", 72 | ".mp4", 73 | ".mov", 74 | ".m4v", 75 | ".mkv", 76 | ".mpg", 77 | ".webm", 78 | ".f4v", 79 | ".wmv", 80 | ".wma", 81 | ".ogg", 82 | ".ogm", 83 | ".mxf", 84 | ".nut", 85 | ] 86 | AudioFolder.EXTENSIONS = AUDIO_EXTENSIONS 87 | -------------------------------------------------------------------------------- /tests/test_sharding_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datasets.utils.sharding import _distribute_shards, _number_of_shards_in_gen_kwargs, _split_gen_kwargs 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "kwargs, expected", 8 | [ 9 | ({"num_shards": 0, "max_num_jobs": 1}, []), 10 | ({"num_shards": 10, "max_num_jobs": 1}, [range(10)]), 11 | ({"num_shards": 10, "max_num_jobs": 10}, [range(i, i + 1) for i in range(10)]), 12 | ({"num_shards": 1, "max_num_jobs": 10}, [range(1)]), 13 | ({"num_shards": 10, "max_num_jobs": 3}, [range(0, 4), range(4, 7), range(7, 10)]), 14 | ({"num_shards": 3, "max_num_jobs": 10}, [range(0, 1), range(1, 2), range(2, 3)]), 15 | ], 16 | ) 17 | def test_distribute_shards(kwargs, expected): 18 | out = _distribute_shards(**kwargs) 19 | assert out == expected 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "gen_kwargs, max_num_jobs, expected", 24 | [ 25 | ({"foo": 0}, 10, [{"foo": 0}]), 26 | ({"shards": [0, 1, 2, 3]}, 1, [{"shards": [0, 1, 2, 3]}]), 27 | ({"shards": [0, 1, 2, 3]}, 4, [{"shards": [0]}, {"shards": [1]}, {"shards": [2]}, {"shards": [3]}]), 28 | ({"shards": [0, 1]}, 4, [{"shards": [0]}, {"shards": [1]}]), 29 | ({"shards": [0, 1, 2, 3]}, 2, [{"shards": [0, 1]}, {"shards": [2, 3]}]), 30 | ], 31 | ) 32 | def test_split_gen_kwargs(gen_kwargs, max_num_jobs, expected): 33 | out = _split_gen_kwargs(gen_kwargs, max_num_jobs) 34 | assert out == expected 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "gen_kwargs, expected", 39 | [ 40 | ({"foo": 0}, 1), 41 | ({"shards": [0]}, 1), 42 | ({"shards": [0, 1, 2, 3]}, 4), 43 | ({"shards": [0, 1, 2, 3], "foo": 0}, 4), 44 | ({"shards": [0, 1, 2, 3], "other": (0, 1)}, 4), 45 | ({"shards": [0, 1, 2, 3], "shards2": [0, 1]}, RuntimeError), 46 | ], 47 | ) 48 | def test_number_of_shards_in_gen_kwargs(gen_kwargs, expected): 49 | if expected is RuntimeError: 50 | with pytest.raises(expected): 51 | _number_of_shards_in_gen_kwargs(gen_kwargs) 52 | else: 53 | out = _number_of_shards_in_gen_kwargs(gen_kwargs) 54 | assert out == expected 55 | -------------------------------------------------------------------------------- /src/datasets/io/text.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from .. 
import Features, NamedSplit 4 | from ..packaged_modules.text.text import Text 5 | from ..utils.typing import NestedDataStructureLike, PathLike 6 | from .abc import AbstractDatasetReader 7 | 8 | 9 | class TextDatasetReader(AbstractDatasetReader): 10 | def __init__( 11 | self, 12 | path_or_paths: NestedDataStructureLike[PathLike], 13 | split: Optional[NamedSplit] = None, 14 | features: Optional[Features] = None, 15 | cache_dir: str = None, 16 | keep_in_memory: bool = False, 17 | streaming: bool = False, 18 | num_proc: Optional[int] = None, 19 | **kwargs, 20 | ): 21 | super().__init__( 22 | path_or_paths, 23 | split=split, 24 | features=features, 25 | cache_dir=cache_dir, 26 | keep_in_memory=keep_in_memory, 27 | streaming=streaming, 28 | num_proc=num_proc, 29 | **kwargs, 30 | ) 31 | path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths} 32 | self.builder = Text( 33 | cache_dir=cache_dir, 34 | data_files=path_or_paths, 35 | features=features, 36 | **kwargs, 37 | ) 38 | 39 | def read(self): 40 | # Build iterable dataset 41 | if self.streaming: 42 | dataset = self.builder.as_streaming_dataset(split=self.split) 43 | # Build regular (map-style) dataset 44 | else: 45 | download_config = None 46 | download_mode = None 47 | verification_mode = None 48 | base_path = None 49 | 50 | self.builder.download_and_prepare( 51 | download_config=download_config, 52 | download_mode=download_mode, 53 | verification_mode=verification_mode, 54 | base_path=base_path, 55 | num_proc=self.num_proc, 56 | ) 57 | dataset = self.builder.as_dataset( 58 | split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory 59 | ) 60 | return dataset 61 | -------------------------------------------------------------------------------- /tests/test_offline_util.py: -------------------------------------------------------------------------------- 1 | from tempfile import NamedTemporaryFile 2 | 3 | import httpx 4 | import pytest 5 | import requests 6 | from huggingface_hub import get_session 7 | from huggingface_hub.errors import OfflineModeIsEnabled 8 | 9 | from datasets.utils.file_utils import fsspec_get, fsspec_head 10 | 11 | from .utils import ( 12 | IS_HF_HUB_1_x, 13 | OfflineSimulationMode, 14 | RequestWouldHangIndefinitelyError, 15 | offline, 16 | require_not_windows, 17 | ) 18 | 19 | 20 | @pytest.mark.integration 21 | @require_not_windows # fsspec get keeps a file handle on windows that raises PermissionError 22 | def test_offline_with_timeout(): 23 | expected_exception = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout 24 | with offline(OfflineSimulationMode.CONNECTION_TIMES_OUT): 25 | with pytest.raises(RequestWouldHangIndefinitelyError): 26 | get_session().request("GET", "https://huggingface.co") 27 | 28 | with pytest.raises(expected_exception): 29 | get_session().request("GET", "https://huggingface.co", timeout=1.0) 30 | 31 | with pytest.raises(expected_exception), NamedTemporaryFile() as temp_file: 32 | fsspec_get("hf://dummy", temp_file=temp_file) 33 | 34 | 35 | @pytest.mark.integration 36 | @require_not_windows # fsspec get keeps a file handle on windows that raises PermissionError 37 | def test_offline_with_connection_error(): 38 | expected_exception = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError 39 | with offline(OfflineSimulationMode.CONNECTION_FAILS): 40 | with pytest.raises(expected_exception): 41 | get_session().request("GET", "https://huggingface.co") 42 | 43 | with pytest.raises(expected_exception), 
NamedTemporaryFile() as temp_file: 44 | fsspec_get("hf://dummy", temp_file=temp_file) 45 | 46 | 47 | def test_offline_with_datasets_offline_mode_enabled(): 48 | with offline(OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1): 49 | with pytest.raises(OfflineModeIsEnabled): 50 | fsspec_head("hf://dummy") 51 | with pytest.raises(OfflineModeIsEnabled), NamedTemporaryFile() as temp_file: 52 | fsspec_get("hf://dummy", temp_file=temp_file) 53 | -------------------------------------------------------------------------------- /benchmarks/utils.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import numpy as np 4 | 5 | import datasets 6 | from datasets.arrow_writer import ArrowWriter 7 | from datasets.features.features import _ArrayXD 8 | 9 | 10 | def get_duration(func): 11 | def wrapper(*args, **kwargs): 12 | starttime = timeit.default_timer() 13 | _ = func(*args, **kwargs) 14 | delta = timeit.default_timer() - starttime 15 | return delta 16 | 17 | wrapper.__name__ = func.__name__ 18 | 19 | return wrapper 20 | 21 | 22 | def generate_examples(features: dict, num_examples=100, seq_shapes=None): 23 | dummy_data = [] 24 | seq_shapes = seq_shapes or {} 25 | for i in range(num_examples): 26 | example = {} 27 | for col_id, (k, v) in enumerate(features.items()): 28 | if isinstance(v, _ArrayXD): 29 | data = np.random.rand(*v.shape).astype(v.dtype) 30 | elif isinstance(v, datasets.Value): 31 | if v.dtype == "string": 32 | data = "The small grey turtle was surprisingly fast when challenged." 33 | else: 34 | data = np.random.randint(10, size=1).astype(v.dtype).item() 35 | elif isinstance(v, datasets.Sequence): 36 | while isinstance(v, datasets.Sequence): 37 | v = v.feature 38 | shape = seq_shapes[k] 39 | data = np.random.rand(*shape).astype(v.dtype) 40 | example[k] = data 41 | 42 | dummy_data.append((i, example)) 43 | 44 | return dummy_data 45 | 46 | 47 | def generate_example_dataset(dataset_path, features, num_examples=100, seq_shapes=None): 48 | dummy_data = generate_examples(features, num_examples=num_examples, seq_shapes=seq_shapes) 49 | 50 | with ArrowWriter(features=features, path=dataset_path) as writer: 51 | for key, record in dummy_data: 52 | example = features.encode_example(record) 53 | writer.write(example) 54 | 55 | num_final_examples, num_bytes = writer.finalize() 56 | 57 | if not num_final_examples == num_examples: 58 | raise ValueError( 59 | f"Error writing the dataset, wrote {num_final_examples} examples but should have written {num_examples}." 60 | ) 61 | 62 | dataset = datasets.Dataset.from_file(filename=dataset_path, info=datasets.DatasetInfo(features=features)) 63 | 64 | return dataset 65 | -------------------------------------------------------------------------------- /src/datasets/io/generator.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | 3 | from .. 
import Features, NamedSplit, Split 4 | from ..packaged_modules.generator.generator import Generator 5 | from .abc import AbstractDatasetInputStream 6 | 7 | 8 | class GeneratorDatasetInputStream(AbstractDatasetInputStream): 9 | def __init__( 10 | self, 11 | generator: Callable, 12 | features: Optional[Features] = None, 13 | cache_dir: str = None, 14 | keep_in_memory: bool = False, 15 | streaming: bool = False, 16 | gen_kwargs: Optional[dict] = None, 17 | num_proc: Optional[int] = None, 18 | split: NamedSplit = Split.TRAIN, 19 | fingerprint: Optional[str] = None, 20 | **kwargs, 21 | ): 22 | super().__init__( 23 | features=features, 24 | cache_dir=cache_dir, 25 | keep_in_memory=keep_in_memory, 26 | streaming=streaming, 27 | num_proc=num_proc, 28 | **kwargs, 29 | ) 30 | self.builder = Generator( 31 | cache_dir=cache_dir, 32 | features=features, 33 | generator=generator, 34 | gen_kwargs=gen_kwargs, 35 | split=split, 36 | config_id="default-fingerprint=" + fingerprint if fingerprint else None, 37 | **kwargs, 38 | ) 39 | self.fingerprint = fingerprint 40 | 41 | def read(self): 42 | # Build iterable dataset 43 | if self.streaming: 44 | dataset = self.builder.as_streaming_dataset(split=self.builder.config.split) 45 | # Build regular (map-style) dataset 46 | else: 47 | download_config = None 48 | download_mode = None 49 | verification_mode = None 50 | base_path = None 51 | 52 | self.builder.download_and_prepare( 53 | download_config=download_config, 54 | download_mode=download_mode, 55 | verification_mode=verification_mode, 56 | base_path=base_path, 57 | num_proc=self.num_proc, 58 | ) 59 | dataset = self.builder.as_dataset( 60 | split=self.builder.config.split, verification_mode=verification_mode, in_memory=self.keep_in_memory 61 | ) 62 | if self.fingerprint: 63 | dataset._fingerprint = self.fingerprint 64 | return dataset 65 | -------------------------------------------------------------------------------- /tests/features/test_pdf.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from datasets import Dataset, Features, Pdf 6 | 7 | from ..utils import require_pdfplumber 8 | 9 | 10 | @require_pdfplumber 11 | @pytest.mark.parametrize( 12 | "build_example", 13 | [ 14 | lambda pdf_path: pdf_path, 15 | lambda pdf_path: Path(pdf_path), 16 | lambda pdf_path: open(pdf_path, "rb").read(), 17 | lambda pdf_path: {"path": pdf_path}, 18 | lambda pdf_path: {"path": pdf_path, "bytes": None}, 19 | lambda pdf_path: {"path": pdf_path, "bytes": open(pdf_path, "rb").read()}, 20 | lambda pdf_path: {"path": None, "bytes": open(pdf_path, "rb").read()}, 21 | lambda pdf_path: {"bytes": open(pdf_path, "rb").read()}, 22 | ], 23 | ) 24 | def test_pdf_feature_encode_example(shared_datadir, build_example): 25 | import pdfplumber 26 | 27 | pdf_path = str(shared_datadir / "test_pdf.pdf") 28 | pdf = Pdf() 29 | encoded_example = pdf.encode_example(build_example(pdf_path)) 30 | assert isinstance(encoded_example, dict) 31 | assert encoded_example.keys() == {"bytes", "path"} 32 | assert encoded_example["bytes"] is not None or encoded_example["path"] is not None 33 | decoded_example = pdf.decode_example(encoded_example) 34 | assert isinstance(decoded_example, pdfplumber.pdf.PDF) 35 | 36 | 37 | @require_pdfplumber 38 | def test_dataset_with_pdf_feature(shared_datadir): 39 | import pdfplumber 40 | 41 | pdf_path = str(shared_datadir / "test_pdf.pdf") 42 | data = {"pdf": [pdf_path]} 43 | features = Features({"pdf": Pdf()}) 44 | dset = 
Dataset.from_dict(data, features=features) 45 | item = dset[0] 46 | assert item.keys() == {"pdf"} 47 | assert isinstance(item["pdf"], pdfplumber.pdf.PDF) 48 | batch = dset[:1] 49 | assert len(batch) == 1 50 | assert batch.keys() == {"pdf"} 51 | assert isinstance(batch["pdf"], list) and all(isinstance(item, pdfplumber.pdf.PDF) for item in batch["pdf"]) 52 | column = dset["pdf"] 53 | assert len(column) == 1 54 | assert isinstance(column, list) and all(isinstance(item, pdfplumber.pdf.PDF) for item in column) 55 | 56 | # from bytes 57 | with open(pdf_path, "rb") as f: 58 | data = {"pdf": [f.read()]} 59 | dset = Dataset.from_dict(data, features=features) 60 | item = dset[0] 61 | assert item.keys() == {"pdf"} 62 | assert isinstance(item["pdf"], pdfplumber.pdf.PDF) 63 | -------------------------------------------------------------------------------- /benchmarks/benchmark_getitem_100B.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from dataclasses import dataclass 4 | 5 | import numpy as np 6 | import pyarrow as pa 7 | 8 | import datasets 9 | from utils import get_duration 10 | 11 | 12 | SPEED_TEST_N_EXAMPLES = 100_000_000_000 13 | SPEED_TEST_CHUNK_SIZE = 10_000 14 | 15 | RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__) 16 | RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json")) 17 | 18 | 19 | def generate_100B_dataset(num_examples: int, chunk_size: int) -> datasets.Dataset: 20 | table = pa.Table.from_pydict({"col": [0] * chunk_size}) 21 | table = pa.concat_tables([table] * (num_examples // chunk_size)) 22 | return datasets.Dataset(table, fingerprint="table_100B") 23 | 24 | 25 | @dataclass 26 | class RandIter: 27 | low: int 28 | high: int 29 | size: int 30 | seed: int 31 | 32 | def __post_init__(self): 33 | rng = np.random.default_rng(self.seed) 34 | self._sampled_values = rng.integers(low=self.low, high=self.high, size=self.size).tolist() 35 | 36 | def __iter__(self): 37 | return iter(self._sampled_values) 38 | 39 | def __len__(self): 40 | return self.size 41 | 42 | 43 | @get_duration 44 | def get_first_row(dataset: datasets.Dataset): 45 | _ = dataset[0] 46 | 47 | 48 | @get_duration 49 | def get_last_row(dataset: datasets.Dataset): 50 | _ = dataset[-1] 51 | 52 | 53 | @get_duration 54 | def get_batch_of_1024_rows(dataset: datasets.Dataset): 55 | _ = dataset[range(len(dataset) // 2, len(dataset) // 2 + 1024)] 56 | 57 | 58 | @get_duration 59 | def get_batch_of_1024_random_rows(dataset: datasets.Dataset): 60 | _ = dataset[RandIter(0, len(dataset), 1024, seed=42)] 61 | 62 | 63 | def benchmark_table_100B(): 64 | times = {"num examples": SPEED_TEST_N_EXAMPLES} 65 | functions = (get_first_row, get_last_row, get_batch_of_1024_rows, get_batch_of_1024_random_rows) 66 | print("generating dataset") 67 | dataset = generate_100B_dataset(num_examples=SPEED_TEST_N_EXAMPLES, chunk_size=SPEED_TEST_CHUNK_SIZE) 68 | print("Functions") 69 | for func in functions: 70 | print(func.__name__) 71 | times[func.__name__] = func(dataset) 72 | 73 | with open(RESULTS_FILE_PATH, "wb") as f: 74 | f.write(json.dumps(times).encode("utf-8")) 75 | 76 | 77 | if __name__ == "__main__": # useful to run the profiler 78 | benchmark_table_100B() 79 | -------------------------------------------------------------------------------- /tests/packaged_modules/test_arrow.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import pytest 3 | 4 | from 
datasets.builder import InvalidConfigName 5 | from datasets.data_files import DataFilesList 6 | from datasets.packaged_modules.arrow.arrow import Arrow, ArrowConfig 7 | 8 | 9 | @pytest.fixture 10 | def arrow_file_streaming_format(tmp_path): 11 | filename = tmp_path / "stream.arrow" 12 | testdata = [[1, 1, 1], [0, 100, 6], [1, 90, 900]] 13 | 14 | schema = pa.schema([pa.field("input_ids", pa.list_(pa.int32()))]) 15 | array = pa.array(testdata, type=pa.list_(pa.int32())) 16 | table = pa.Table.from_arrays([array], schema=schema) 17 | with open(filename, "wb") as f: 18 | with pa.ipc.new_stream(f, schema) as writer: 19 | writer.write_table(table) 20 | return str(filename) 21 | 22 | 23 | @pytest.fixture 24 | def arrow_file_file_format(tmp_path): 25 | filename = tmp_path / "file.arrow" 26 | testdata = [[1, 1, 1], [0, 100, 6], [1, 90, 900]] 27 | 28 | schema = pa.schema([pa.field("input_ids", pa.list_(pa.int32()))]) 29 | array = pa.array(testdata, type=pa.list_(pa.int32())) 30 | table = pa.Table.from_arrays([array], schema=schema) 31 | with open(filename, "wb") as f: 32 | with pa.ipc.new_file(f, schema) as writer: 33 | writer.write_table(table) 34 | return str(filename) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "file_fixture, config_kwargs", 39 | [ 40 | ("arrow_file_streaming_format", {}), 41 | ("arrow_file_file_format", {}), 42 | ], 43 | ) 44 | def test_arrow_generate_tables(file_fixture, config_kwargs, request): 45 | arrow = Arrow(**config_kwargs) 46 | generator = arrow._generate_tables([[request.getfixturevalue(file_fixture)]]) 47 | pa_table = pa.concat_tables([table for _, table in generator]) 48 | 49 | expected = {"input_ids": [[1, 1, 1], [0, 100, 6], [1, 90, 900]]} 50 | assert pa_table.to_pydict() == expected 51 | 52 | 53 | def test_config_raises_when_invalid_name() -> None: 54 | with pytest.raises(InvalidConfigName, match="Bad characters"): 55 | _ = ArrowConfig(name="name-with-*-invalid-character") 56 | 57 | 58 | @pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])]) 59 | def test_config_raises_when_invalid_data_files(data_files) -> None: 60 | with pytest.raises(ValueError, match="Expected a DataFilesDict"): 61 | _ = ArrowConfig(name="name", data_files=data_files) 62 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import datasets 4 | import datasets.config 5 | 6 | 7 | # Import fixture modules as plugins 8 | pytest_plugins = ["tests.fixtures.files", "tests.fixtures.hub", "tests.fixtures.fsspec"] 9 | 10 | 11 | def pytest_collection_modifyitems(config, items): 12 | # Mark tests as "unit" by default if not marked as "integration" (or already marked as "unit") 13 | for item in items: 14 | if any(marker in item.keywords for marker in ["integration", "unit"]): 15 | continue 16 | item.add_marker(pytest.mark.unit) 17 | 18 | 19 | @pytest.fixture(autouse=True) 20 | def set_test_cache_config(tmp_path_factory, monkeypatch): 21 | # test_hf_cache_home = tmp_path_factory.mktemp("cache") # TODO: why a cache dir per test function does not work? 
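    # Fall back to a single cache directory shared by the whole test session, under pytest's base temp dir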
22 | test_hf_cache_home = tmp_path_factory.getbasetemp() / "cache" 23 | test_hf_datasets_cache = test_hf_cache_home / "datasets" 24 | monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", str(test_hf_datasets_cache)) 25 | test_downloaded_datasets_path = test_hf_datasets_cache / "downloads" 26 | monkeypatch.setattr("datasets.config.DOWNLOADED_DATASETS_PATH", str(test_downloaded_datasets_path)) 27 | test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted" 28 | monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path)) 29 | 30 | 31 | @pytest.fixture(autouse=True) 32 | def disable_implicit_token(monkeypatch): 33 | monkeypatch.setattr("huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN", True) 34 | 35 | 36 | @pytest.fixture(autouse=True, scope="session") 37 | def disable_tqdm_output(): 38 | datasets.disable_progress_bar() 39 | 40 | 41 | @pytest.fixture(autouse=True) 42 | def set_update_download_counts_to_false(monkeypatch): 43 | # don't take tests into account when counting downloads 44 | monkeypatch.setattr("datasets.config.HF_UPDATE_DOWNLOAD_COUNTS", False) 45 | 46 | 47 | @pytest.fixture 48 | def set_sqlalchemy_silence_uber_warning(monkeypatch): 49 | # Required to suppress RemovedIn20Warning when feature(s) are not compatible with SQLAlchemy 2.0 50 | # To be removed once SQLAlchemy 2.0 supported 51 | try: 52 | monkeypatch.setattr("sqlalchemy.util.deprecations.SILENCE_UBER_WARNING", True) 53 | except (ModuleNotFoundError, AttributeError): 54 | pass 55 | 56 | 57 | @pytest.fixture(autouse=True, scope="session") 58 | def zero_time_out_for_remote_code(): 59 | datasets.config.TIME_OUT_REMOTE_CODE = 0 60 | -------------------------------------------------------------------------------- /tests/test_filesystem.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import fsspec 4 | import pytest 5 | from fsspec.core import url_to_fs 6 | from fsspec.registry import _registry as _fsspec_registry 7 | 8 | from datasets.filesystems import COMPRESSION_FILESYSTEMS, is_remote_filesystem 9 | 10 | from .utils import require_lz4, require_zstandard 11 | 12 | 13 | def test_mockfs(mockfs): 14 | assert "mock" in _fsspec_registry 15 | assert "bz2" in _fsspec_registry 16 | 17 | 18 | def test_non_mockfs(): 19 | assert "mock" not in _fsspec_registry 20 | assert "bz2" in _fsspec_registry 21 | 22 | 23 | def test_is_remote_filesystem(mockfs): 24 | is_remote = is_remote_filesystem(mockfs) 25 | assert is_remote is True 26 | 27 | fs = fsspec.filesystem("file") 28 | 29 | is_remote = is_remote_filesystem(fs) 30 | assert is_remote is False 31 | 32 | 33 | @pytest.mark.parametrize("compression_fs_class", COMPRESSION_FILESYSTEMS) 34 | def test_compression_filesystems(compression_fs_class, gz_file, bz2_file, lz4_file, zstd_file, xz_file, text_file): 35 | input_paths = {"gzip": gz_file, "xz": xz_file, "zstd": zstd_file, "bz2": bz2_file, "lz4": lz4_file} 36 | input_path = input_paths[compression_fs_class.protocol] 37 | if input_path is None: 38 | reason = f"for '{compression_fs_class.protocol}' compression protocol, " 39 | if compression_fs_class.protocol == "lz4": 40 | reason += require_lz4.kwargs["reason"] 41 | elif compression_fs_class.protocol == "zstd": 42 | reason += require_zstandard.kwargs["reason"] 43 | pytest.skip(reason) 44 | fs = fsspec.filesystem(compression_fs_class.protocol, fo=input_path) 45 | expected_filename = os.path.basename(input_path) 46 | expected_filename = expected_filename[: 
expected_filename.rindex(".")] 47 | assert fs.glob("*") == [expected_filename] 48 | with fs.open(expected_filename, "r", encoding="utf-8") as f, open(text_file, encoding="utf-8") as expected_file: 49 | assert f.read() == expected_file.read() 50 | 51 | 52 | @pytest.mark.parametrize("protocol", ["zip", "gzip"]) 53 | def test_fs_isfile(protocol, zip_jsonl_path, jsonl_gz_path): 54 | compressed_file_paths = {"zip": zip_jsonl_path, "gzip": jsonl_gz_path} 55 | compressed_file_path = compressed_file_paths[protocol] 56 | member_file_path = "dataset.jsonl" 57 | path = f"{protocol}://{member_file_path}::{compressed_file_path}" 58 | fs, *_ = url_to_fs(path) 59 | assert fs.isfile(member_file_path) 60 | assert not fs.isfile("non_existing_" + member_file_path) 61 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/imagefolder/imagefolder.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from ..folder_based_builder import folder_based_builder 4 | 5 | 6 | logger = datasets.utils.logging.get_logger(__name__) 7 | 8 | 9 | class ImageFolderConfig(folder_based_builder.FolderBasedBuilderConfig): 10 | """BuilderConfig for ImageFolder.""" 11 | 12 | drop_labels: bool = None 13 | drop_metadata: bool = None 14 | 15 | def __post_init__(self): 16 | super().__post_init__() 17 | 18 | 19 | class ImageFolder(folder_based_builder.FolderBasedBuilder): 20 | BASE_FEATURE = datasets.Image 21 | BASE_COLUMN_NAME = "image" 22 | BUILDER_CONFIG_CLASS = ImageFolderConfig 23 | EXTENSIONS: list[str] # definition at the bottom of the script 24 | 25 | 26 | # Obtained with: 27 | # ``` 28 | # import PIL.Image 29 | # IMAGE_EXTENSIONS = [] 30 | # PIL.Image.init() 31 | # for ext, format in PIL.Image.EXTENSION.items(): 32 | # if format in PIL.Image.OPEN: 33 | # IMAGE_EXTENSIONS.append(ext[1:]) 34 | # ``` 35 | # We intentionally do not run this code on launch because: 36 | # (1) Pillow is an optional dependency, so importing Pillow in global namespace is not allowed 37 | # (2) To ensure the list of supported extensions is deterministic 38 | IMAGE_EXTENSIONS = [ 39 | ".blp", 40 | ".bmp", 41 | ".dib", 42 | ".bufr", 43 | ".cur", 44 | ".pcx", 45 | ".dcx", 46 | ".dds", 47 | ".ps", 48 | ".eps", 49 | ".fit", 50 | ".fits", 51 | ".fli", 52 | ".flc", 53 | ".ftc", 54 | ".ftu", 55 | ".gbr", 56 | ".gif", 57 | ".grib", 58 | # ".h5", # may contain zero or several images 59 | # ".hdf", # may contain zero or several images 60 | ".png", 61 | ".apng", 62 | ".jp2", 63 | ".j2k", 64 | ".jpc", 65 | ".jpf", 66 | ".jpx", 67 | ".j2c", 68 | ".icns", 69 | ".ico", 70 | ".im", 71 | ".iim", 72 | ".tif", 73 | ".tiff", 74 | ".jfif", 75 | ".jpe", 76 | ".jpg", 77 | ".jpeg", 78 | ".mpg", 79 | ".mpeg", 80 | ".msp", 81 | ".pcd", 82 | ".pxr", 83 | ".pbm", 84 | ".pgm", 85 | ".ppm", 86 | ".pnm", 87 | ".psd", 88 | ".bw", 89 | ".rgb", 90 | ".rgba", 91 | ".sgi", 92 | ".ras", 93 | ".tga", 94 | ".icb", 95 | ".vda", 96 | ".vst", 97 | ".webp", 98 | ".wmf", 99 | ".emf", 100 | ".xbm", 101 | ".xpm", 102 | ] 103 | ImageFolder.EXTENSIONS = IMAGE_EXTENSIONS 104 | -------------------------------------------------------------------------------- /docs/source/about_arrow.md: -------------------------------------------------------------------------------- 1 | # Datasets 🤝 Arrow 2 | 3 | ## What is Arrow? 4 | 5 | [Arrow](https://arrow.apache.org/) enables large amounts of data to be processed and moved quickly. 
It is a specific data format that stores data in a columnar memory layout. This provides several significant advantages: 6 | 7 | * Arrow's standard format allows [zero-copy reads](https://en.wikipedia.org/wiki/Zero-copy) which removes virtually all serialization overhead. 8 | * Arrow is language-agnostic so it supports different programming languages. 9 | * Arrow is column-oriented so it is faster at querying and processing slices or columns of data. 10 | * Arrow allows for copy-free hand-offs to standard machine learning tools such as NumPy, Pandas, PyTorch, and TensorFlow. 11 | * Arrow supports many, possibly nested, column types. 12 | 13 | ## Memory-mapping 14 | 15 | 🤗 Datasets uses Arrow for its local caching system. It allows datasets to be backed by an on-disk cache, which is memory-mapped for fast lookup. 16 | This architecture allows for large datasets to be used on machines with relatively small device memory. 17 | 18 | For example, loading the full English Wikipedia dataset only takes a few MB of RAM: 19 | 20 | ```python 21 | >>> import os; import psutil; import timeit 22 | >>> from datasets import load_dataset 23 | 24 | # Process.memory_info is expressed in bytes, so convert to megabytes 25 | >>> mem_before = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024) 26 | >>> wiki = load_dataset("wikimedia/wikipedia", "20220301.en", split="train") 27 | >>> mem_after = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024) 28 | 29 | >>> print(f"RAM memory used: {(mem_after - mem_before)} MB") 30 | RAM memory used: 50 MB 31 | ``` 32 | 33 | This is possible because the Arrow data is actually memory-mapped from disk, and not loaded in memory. 34 | Memory-mapping allows access to data on disk, and leverages virtual memory capabilities for fast lookups. 35 | 36 | ## Performance 37 | 38 | Iterating over a memory-mapped dataset using Arrow is fast. Iterating over Wikipedia on a laptop gives you speeds of 1-3 Gbit/s: 39 | 40 | ```python 41 | >>> s = """batch_size = 1000 42 | ... for batch in wiki.iter(batch_size): 43 | ... ... 44 | ... """ 45 | 46 | >>> elapsed_time = timeit.timeit(stmt=s, number=1, globals=globals()) 47 | >>> print(f"Time to iterate over the {wiki.dataset_size >> 30} GB dataset: {elapsed_time:.1f} sec, " 48 | ... f"ie. {float(wiki.dataset_size >> 27)/elapsed_time:.1f} Gb/s") 49 | Time to iterate over the 18 GB dataset: 31.8 sec, ie. 4.8 Gb/s 50 | ``` 51 | -------------------------------------------------------------------------------- /src/datasets/utils/_filelock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License 15 | """Utilities to handle file locking in `datasets`.""" 16 | 17 | import os 18 | 19 | from filelock import FileLock as FileLock_ 20 | from filelock import UnixFileLock 21 | from filelock import __version__ as _filelock_version 22 | from packaging import version 23 | 24 | 25 | class FileLock(FileLock_): 26 | """ 27 | A `filelock.FileLock` initializer that handles long paths. 28 | It also uses the current umask for lock files. 29 | """ 30 | 31 | MAX_FILENAME_LENGTH = 255 32 | 33 | def __init__(self, lock_file, *args, **kwargs): 34 | # The "mode" argument is required if we want to use the current umask in filelock >= 3.10 35 | # In previous previous it was already using the current umask. 36 | if "mode" not in kwargs and version.parse(_filelock_version) >= version.parse("3.10.0"): 37 | umask = os.umask(0o666) 38 | os.umask(umask) 39 | kwargs["mode"] = 0o666 & ~umask 40 | lock_file = self.hash_filename_if_too_long(lock_file) 41 | super().__init__(lock_file, *args, **kwargs) 42 | 43 | @classmethod 44 | def hash_filename_if_too_long(cls, path: str) -> str: 45 | path = os.path.abspath(os.path.expanduser(path)) 46 | filename = os.path.basename(path) 47 | max_filename_length = cls.MAX_FILENAME_LENGTH 48 | if issubclass(cls, UnixFileLock): 49 | max_filename_length = min(max_filename_length, os.statvfs(os.path.dirname(path)).f_namemax) 50 | if len(filename) > max_filename_length: 51 | dirname = os.path.dirname(path) 52 | hashed_filename = str(hash(filename)) 53 | new_filename = ( 54 | filename[: max_filename_length - len(hashed_filename) - 8] + "..." + hashed_filename + ".lock" 55 | ) 56 | return os.path.join(dirname, new_filename) 57 | else: 58 | return path 59 | -------------------------------------------------------------------------------- /docs/source/package_reference/utilities.mdx: -------------------------------------------------------------------------------- 1 | # Utilities 2 | 3 | ## Configure logging 4 | 5 | 🤗 Datasets strives to be transparent and explicit about how it works, but this can be quite verbose at times. We have included a series of logging methods which allow you to easily adjust the level of verbosity of the entire library. Currently the default verbosity of the library is set to `WARNING`. 6 | 7 | To change the level of verbosity, use one of the direct setters. For instance, here is how to change the verbosity to the `INFO` level: 8 | 9 | ```py 10 | import datasets 11 | datasets.logging.set_verbosity_info() 12 | ``` 13 | 14 | You can also use the environment variable `DATASETS_VERBOSITY` to override the default verbosity, and set it to one of the following: `debug`, `info`, `warning`, `error`, `critical`: 15 | 16 | ```bash 17 | DATASETS_VERBOSITY=error ./myprogram.py 18 | ``` 19 | 20 | All the methods of this logging module are documented below. The main ones are: 21 | 22 | - [`logging.get_verbosity`] to get the current level of verbosity in the logger 23 | - [`logging.set_verbosity`] to set the verbosity to the level of your choice 24 | 25 | In order from the least to the most verbose (with their corresponding `int` values): 26 | 27 | 1. `logging.CRITICAL` or `logging.FATAL` (int value, 50): only report the most critical errors. 28 | 2. `logging.ERROR` (int value, 40): only report errors. 29 | 3. `logging.WARNING` or `logging.WARN` (int value, 30): only reports error and warnings. This the default level used by the library. 30 | 4. 
`logging.INFO` (int value, 20): reports error, warnings and basic information. 31 | 5. `logging.DEBUG` (int value, 10): report all information. 32 | 33 | [[autodoc]] datasets.logging.get_verbosity 34 | 35 | [[autodoc]] datasets.logging.set_verbosity 36 | 37 | [[autodoc]] datasets.logging.set_verbosity_info 38 | 39 | [[autodoc]] datasets.logging.set_verbosity_warning 40 | 41 | [[autodoc]] datasets.logging.set_verbosity_debug 42 | 43 | [[autodoc]] datasets.logging.set_verbosity_error 44 | 45 | [[autodoc]] datasets.logging.disable_propagation 46 | 47 | [[autodoc]] datasets.logging.enable_propagation 48 | 49 | ## Configure progress bars 50 | 51 | By default, `tqdm` progress bars will be displayed during dataset download and preprocessing. You can disable them globally by setting `HF_DATASETS_DISABLE_PROGRESS_BARS` 52 | environment variable. You can also enable/disable them using [`~utils.enable_progress_bars`] and [`~utils.disable_progress_bars`]. If set, the environment variable has priority on the helpers. 53 | 54 | [[autodoc]] datasets.utils.enable_progress_bars 55 | 56 | [[autodoc]] datasets.utils.disable_progress_bars 57 | 58 | [[autodoc]] datasets.utils.are_progress_bars_disabled -------------------------------------------------------------------------------- /docs/source/dataset_card.mdx: -------------------------------------------------------------------------------- 1 | # Create a dataset card 2 | 3 | Each dataset should have a dataset card to promote responsible usage and inform users of any potential biases within the dataset. 4 | This idea was inspired by the Model Cards proposed by [Mitchell, 2018](https://huggingface.co/papers/1810.03993). 5 | Dataset cards help users understand a dataset's contents, the context for using the dataset, how it was created, and any other considerations a user should be aware of. 6 | 7 | Creating a dataset card is easy and can be done in just a few steps: 8 | 9 | 1. Go to your dataset repository on the [Hub](https://hf.co/new-dataset) and click on **Create Dataset Card** to create a new `README.md` file in your repository. 10 | 11 | 2. Use the **Metadata UI** to select the tags that describe your dataset. You can add a license, language, pretty_name, the task_categories, size_categories, and any other tags that you think are relevant. These tags help users discover and find your dataset on the Hub. 12 | 13 |
14 | 15 | 16 |
17 | 18 | > [!TIP] 19 | > For a complete, but not required, set of tag options you can also look at the [Dataset Card specifications](https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1). This'll have a few more tag options like `multilinguality` and `language_creators` which are useful but not absolutely necessary. 20 | 21 | 3. Click on the **Import dataset card template** link to automatically create a template with all the relevant fields to complete. Fill out the template sections to the best of your ability. Take a look at the [Dataset Card Creation Guide](https://github.com/huggingface/datasets/blob/main/templates/README_guide.md) for more detailed information about what to include in each section of the card. For fields you are unable to complete, you can write **[More Information Needed]**. 22 | 23 | 4. Once you're done, commit the changes to the `README.md` file and you'll see the completed dataset card on your repository. 24 | 25 | YAML also allows you to customize the way your dataset is loaded by [defining splits and/or configurations](./repository_structure#define-your-splits-and-subsets-in-yaml) without the need to write any code. 26 | 27 | Feel free to take a look at the [SNLI](https://huggingface.co/datasets/stanfordnlp/snli), [CNN/DailyMail](https://huggingface.co/datasets/abisee/cnn_dailymail), and [Allociné](https://huggingface.co/datasets/tblard/allocine) dataset cards as examples to help you get started. 28 | -------------------------------------------------------------------------------- /benchmarks/benchmark_map_filter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | 5 | import transformers 6 | 7 | import datasets 8 | from utils import generate_example_dataset, get_duration 9 | 10 | 11 | SPEED_TEST_N_EXAMPLES = 500_000 12 | 13 | RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__) 14 | RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json")) 15 | 16 | 17 | @get_duration 18 | def map(dataset: datasets.Dataset, **kwargs): 19 | _ = dataset.map(**kwargs) 20 | 21 | 22 | @get_duration 23 | def filter(dataset: datasets.Dataset, **kwargs): 24 | _ = dataset.filter(**kwargs) 25 | 26 | 27 | def benchmark_map_filter(): 28 | times = {"num examples": SPEED_TEST_N_EXAMPLES} 29 | with tempfile.TemporaryDirectory() as tmp_dir: 30 | features = datasets.Features({"text": datasets.Value("string"), "numbers": datasets.Value("float32")}) 31 | dataset = generate_example_dataset( 32 | os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES 33 | ) 34 | 35 | tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True) 36 | 37 | def tokenize(examples): 38 | return tokenizer(examples["text"]) 39 | 40 | times["map identity"] = map(dataset) 41 | 42 | times["map identity batched"] = map(dataset, batched=True) 43 | 44 | times["map no-op batched"] = map(dataset, function=lambda x: None, batched=True) 45 | 46 | with dataset.formatted_as(type="numpy"): 47 | times["map no-op batched numpy"] = map(dataset, function=lambda x: None, batched=True) 48 | 49 | with dataset.formatted_as(type="pandas"): 50 | times["map no-op batched pandas"] = map(dataset, function=lambda x: None, batched=True) 51 | 52 | with dataset.formatted_as(type="torch", columns="numbers"): 53 | times["map no-op batched pytorch"] = map(dataset, function=lambda x: None, batched=True) 54 | 55 | with 
dataset.formatted_as(type="tensorflow", columns="numbers"): 56 | times["map no-op batched tensorflow"] = map(dataset, function=lambda x: None, batched=True) 57 | 58 | times["map fast-tokenizer batched"] = map(dataset, function=tokenize, batched=True) 59 | 60 | times["filter"] = filter(dataset) 61 | 62 | # Activate later when tokenizer support batched inputs 63 | # with dataset.formatted_as(type='numpy'): 64 | # times[func.__name__ + " fast-tokenizer batched numpy"] = func(dataset, function=tokenize, batched=True) 65 | 66 | with open(RESULTS_FILE_PATH, "wb") as f: 67 | f.write(json.dumps(times).encode("utf-8")) 68 | 69 | 70 | if __name__ == "__main__": # useful to run the profiler 71 | benchmark_map_filter() 72 | -------------------------------------------------------------------------------- /docs/source/use_with_pandas.mdx: -------------------------------------------------------------------------------- 1 | # Use with Pandas 2 | 3 | This document is a quick introduction to using `datasets` with Pandas, with a particular focus on how to process 4 | datasets using Pandas functions, and how to convert a dataset to Pandas or from Pandas. 5 | 6 | This is particularly useful as it allows fast operations, since `datasets` uses PyArrow under the hood and PyArrow is well integrated with Pandas. 7 | 8 | ## Dataset format 9 | 10 | By default, datasets return regular Python objects: integers, floats, strings, lists, etc. 11 | 12 | To get Pandas DataFrames or Series instead, you can set the format of the dataset to `pandas` using [`Dataset.with_format`]: 13 | 14 | ```py 15 | >>> from datasets import Dataset 16 | >>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]} 17 | >>> ds = Dataset.from_dict(data) 18 | >>> ds = ds.with_format("pandas") 19 | >>> ds[0] # pd.DataFrame 20 | col_0 col_1 21 | 0 a 0.0 22 | >>> ds[:2] # pd.DataFrame 23 | col_0 col_1 24 | 0 a 0.0 25 | 1 b 0.0 26 | >>> ds["data"] # pd.Series 27 | 0 a 28 | 1 b 29 | 2 c 30 | 3 d 31 | Name: col_0, dtype: object 32 | ``` 33 | 34 | This also works for `IterableDataset` objects obtained e.g. using `load_dataset(..., streaming=True)`: 35 | 36 | ```py 37 | >>> ds = ds.with_format("pandas") 38 | >>> for df in ds.iter(batch_size=2): 39 | ... print(df) 40 | ... break 41 | col_0 col_1 42 | 0 a 0.0 43 | 1 b 0.0 44 | ``` 45 | 46 | ## Process data 47 | 48 | Pandas functions are generally faster than regular hand-written python functions, and therefore they are a good option to optimize data processing. You can use Pandas functions to process a dataset in [`Dataset.map`] or [`Dataset.filter`]: 49 | 50 | ```python 51 | >>> from datasets import Dataset 52 | >>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]} 53 | >>> ds = Dataset.from_dict(data) 54 | >>> ds = ds.with_format("pandas") 55 | >>> ds = ds.map(lambda df: df.assign(col_2=df.col_1 + 1), batched=True) 56 | >>> ds[:2] 57 | col_0 col_1 col_2 58 | 0 a 0.0 1.0 59 | 1 b 0.0 1.0 60 | >>> ds = ds.filter(lambda df: df.col_0 == "b", batched=True) 61 | >>> ds[0] 62 | col_0 col_1 col_2 63 | 0 b 0.0 1.0 64 | ``` 65 | 66 | We use `batched=True` because it is faster to process batches of data in Pandas rather than row by row. It's also possible to use `batch_size=` in `map()` to set the size of each `df`. 67 | 68 | This also works for [`IterableDataset.map`] and [`IterableDataset.filter`]. 
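For instance, here is a minimal sketch of the same pandas-based `map` on an `IterableDataset` (the call to `to_iterable_dataset` below is only used to build a small streaming dataset for illustration):

```python
>>> from datasets import Dataset
>>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]}
>>> ids = Dataset.from_dict(data).to_iterable_dataset()
>>> ids = ids.with_format("pandas")
>>> ids = ids.map(lambda df: df.assign(col_2=df.col_1 + 1), batched=True)
>>> for df in ids.iter(batch_size=2):
...     # each `df` should be a pd.DataFrame that includes the new `col_2` column
...     break
```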
69 | 70 | ## Import or Export from Pandas 71 | 72 | To import data from Pandas, you can use [`Dataset.from_pandas`]: 73 | 74 | ```python 75 | ds = Dataset.from_pandas(df) 76 | ``` 77 | 78 | And you can use [`Dataset.to_pandas`] to export a Dataset to a Pandas DataFrame: 79 | 80 | 81 | ```python 82 | df = Dataset.to_pandas() 83 | ``` 84 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/pandas/pandas.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import warnings 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | import pandas as pd 7 | import pyarrow as pa 8 | 9 | import datasets 10 | from datasets.builder import Key 11 | from datasets.table import table_cast 12 | 13 | 14 | @dataclass 15 | class PandasConfig(datasets.BuilderConfig): 16 | """BuilderConfig for Pandas.""" 17 | 18 | features: Optional[datasets.Features] = None 19 | 20 | def __post_init__(self): 21 | super().__post_init__() 22 | 23 | 24 | class Pandas(datasets.ArrowBasedBuilder): 25 | BUILDER_CONFIG_CLASS = PandasConfig 26 | 27 | def _info(self): 28 | warnings.warn( 29 | "The Pandas builder is deprecated and will be removed in the next major version of datasets.", 30 | FutureWarning, 31 | ) 32 | return datasets.DatasetInfo(features=self.config.features) 33 | 34 | def _split_generators(self, dl_manager): 35 | """We handle string, list and dicts in datafiles""" 36 | if not self.config.data_files: 37 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 38 | data_files = dl_manager.download_and_extract(self.config.data_files) 39 | if isinstance(data_files, (str, list, tuple)): 40 | files = data_files 41 | if isinstance(files, str): 42 | files = [files] 43 | # Use `dl_manager.iter_files` to skip hidden files in an extracted archive 44 | files = [dl_manager.iter_files(file) for file in files] 45 | return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})] 46 | splits = [] 47 | for split_name, files in data_files.items(): 48 | if isinstance(files, str): 49 | files = [files] 50 | # Use `dl_manager.iter_files` to skip hidden files in an extracted archive 51 | files = [dl_manager.iter_files(file) for file in files] 52 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) 53 | return splits 54 | 55 | def _cast_table(self, pa_table: pa.Table) -> pa.Table: 56 | if self.config.features is not None: 57 | # more expensive cast to support nested features with keys in a different order 58 | # allows str <-> int/float or str to Audio for example 59 | pa_table = table_cast(pa_table, self.config.features.arrow_schema) 60 | return pa_table 61 | 62 | def _generate_tables(self, files): 63 | for i, file in enumerate(itertools.chain.from_iterable(files)): 64 | with open(file, "rb") as f: 65 | pa_table = pa.Table.from_pandas(pd.read_pickle(f)) 66 | yield Key(i, 0), self._cast_table(pa_table) 67 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/eval/eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from itertools import islice 4 | 5 | import pyarrow as pa 6 | 7 | import datasets 8 | from datasets.builder import Key 9 | 10 | 11 | logger = datasets.utils.logging.get_logger(__name__) 12 | 13 | 14 | class Eval(datasets.GeneratorBasedBuilder): 15 | 
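    # Number of sample records to read when inferring the features if they are not already set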
NUM_EXAMPLES_FOR_FEATURES_INFERENCE = 5 16 | 17 | def _info(self): 18 | return datasets.DatasetInfo() 19 | 20 | def _split_generators(self, dl_manager): 21 | """We handle string, list and dicts in datafiles""" 22 | if not self.config.data_files: 23 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 24 | dl_manager.download_config.extract_on_the_fly = True 25 | data_files = dl_manager.download_and_extract(self.config.data_files) 26 | splits = [] 27 | for split_name, logs in data_files.items(): 28 | if isinstance(logs, str): 29 | logs = [logs] 30 | logs_files = [dl_manager.iter_files(log) for log in logs] 31 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"logs_files": logs_files})) 32 | if not self.info.features: 33 | first_examples = list( 34 | islice(self._iter_samples_from_log_files(logs_files[0]), self.NUM_EXAMPLES_FOR_FEATURES_INFERENCE) 35 | ) 36 | pa_tables = [pa.Table.from_pylist([example]) for example in first_examples] 37 | inferred_arrow_schema = pa.concat_tables(pa_tables, promote_options="default").schema 38 | self.info.features = datasets.Features.from_arrow_schema(inferred_arrow_schema) 39 | 40 | return splits 41 | 42 | def _sort_samples_key(self, sample_path: str): 43 | # looks like "{sample_idx}_epoch_{epoch_idx}"" 44 | (sample_idx_str, epoch_idx_str) = os.path.splitext(os.path.basename(sample_path))[0].split("_epoch_") 45 | return (int(epoch_idx_str), int(sample_idx_str)) 46 | 47 | def _iter_samples_from_log_files(self, log_files: list[str]): 48 | sample_files = [log_file for log_file in log_files if os.path.basename(os.path.dirname(log_file)) == "samples"] 49 | sample_files.sort(key=self._sort_samples_key) 50 | for sample_file in sample_files: 51 | with open(sample_file) as f: 52 | sample = json.load(f) 53 | for field in sample: 54 | if isinstance(sample[field], dict): 55 | sample[field] = json.dumps(sample[field]) 56 | if isinstance(sample[field], list): 57 | sample[field] = [json.dumps(x) for x in sample[field]] 58 | yield sample 59 | 60 | def _generate_examples(self, logs_files): 61 | for file_idx, log_files in enumerate(logs_files): 62 | for sample_idx, sample in enumerate(self._iter_samples_from_log_files(log_files)): 63 | yield Key(file_idx, sample_idx), sample 64 | -------------------------------------------------------------------------------- /docs/source/index.mdx: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | 4 | 5 | 🤗 Datasets is a library for easily accessing and sharing AI datasets for Audio, Computer Vision, and Natural Language Processing (NLP) tasks. 6 | 7 | Load a dataset in a single line of code, and use our powerful data processing and streaming methods to quickly get your dataset ready for training in a deep learning model. Backed by the Apache Arrow format, process large datasets with zero-copy reads without any memory constraints for optimal speed and efficiency. We also feature a deep integration with the [Hugging Face Hub](https://huggingface.co/datasets), allowing you to easily load and share a dataset with the wider machine learning community. 8 | 9 | Find your dataset today on the [Hugging Face Hub](https://huggingface.co/datasets), and take an in-depth look inside of it with the live viewer. 
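For example, loading a dataset from the Hub takes a single call (the repository id below is a placeholder, not a real dataset):

```py
from datasets import load_dataset

dataset = load_dataset("username/my_dataset", split="train")
```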
10 | 11 | 31 | -------------------------------------------------------------------------------- /docs/source/package_reference/table_classes.mdx: -------------------------------------------------------------------------------- 1 | # Table Classes 2 | 3 | Each `Dataset` object is backed by a PyArrow Table. 4 | A Table can be loaded from either the disk (memory mapped) or in memory. 5 | Several Table types are available, and they all inherit from [`table.Table`]. 6 | 7 | ## Table 8 | 9 | [[autodoc]] datasets.table.Table 10 | - validate 11 | - equals 12 | - to_batches 13 | - to_pydict 14 | - to_pandas 15 | - to_string 16 | - field 17 | - column 18 | - itercolumns 19 | - schema 20 | - columns 21 | - num_columns 22 | - num_rows 23 | - shape 24 | - nbytes 25 | 26 | ## InMemoryTable 27 | 28 | [[autodoc]] datasets.table.InMemoryTable 29 | - validate 30 | - equals 31 | - to_batches 32 | - to_pydict 33 | - to_pandas 34 | - to_string 35 | - field 36 | - column 37 | - itercolumns 38 | - schema 39 | - columns 40 | - num_columns 41 | - num_rows 42 | - shape 43 | - nbytes 44 | - column_names 45 | - slice 46 | - filter 47 | - flatten 48 | - combine_chunks 49 | - cast 50 | - replace_schema_metadata 51 | - add_column 52 | - append_column 53 | - remove_column 54 | - set_column 55 | - rename_columns 56 | - select 57 | - drop 58 | - from_file 59 | - from_buffer 60 | - from_pandas 61 | - from_arrays 62 | - from_pydict 63 | - from_batches 64 | 65 | ## MemoryMappedTable 66 | 67 | [[autodoc]] datasets.table.MemoryMappedTable 68 | - validate 69 | - equals 70 | - to_batches 71 | - to_pydict 72 | - to_pandas 73 | - to_string 74 | - field 75 | - column 76 | - itercolumns 77 | - schema 78 | - columns 79 | - num_columns 80 | - num_rows 81 | - shape 82 | - nbytes 83 | - column_names 84 | - slice 85 | - filter 86 | - flatten 87 | - combine_chunks 88 | - cast 89 | - replace_schema_metadata 90 | - add_column 91 | - append_column 92 | - remove_column 93 | - set_column 94 | - rename_columns 95 | - select 96 | - drop 97 | - from_file 98 | 99 | ## ConcatenationTable 100 | 101 | [[autodoc]] datasets.table.ConcatenationTable 102 | - validate 103 | - equals 104 | - to_batches 105 | - to_pydict 106 | - to_pandas 107 | - to_string 108 | - field 109 | - column 110 | - itercolumns 111 | - schema 112 | - columns 113 | - num_columns 114 | - num_rows 115 | - shape 116 | - nbytes 117 | - column_names 118 | - slice 119 | - filter 120 | - flatten 121 | - combine_chunks 122 | - cast 123 | - replace_schema_metadata 124 | - add_column 125 | - append_column 126 | - remove_column 127 | - set_column 128 | - rename_columns 129 | - select 130 | - drop 131 | - from_blocks 132 | - from_tables 133 | 134 | ## Utils 135 | 136 | [[autodoc]] datasets.table.concat_tables 137 | 138 | [[autodoc]] datasets.table.list_table_cache_files 139 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/xml/xml.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from dataclasses import dataclass 3 | from typing import Optional 4 | 5 | import pyarrow as pa 6 | 7 | import datasets 8 | from datasets.features.features import require_storage_cast 9 | from datasets.table import table_cast 10 | 11 | 12 | logger = datasets.utils.logging.get_logger(__name__) 13 | 14 | 15 | @dataclass 16 | class XmlConfig(datasets.BuilderConfig): 17 | """BuilderConfig for xml files.""" 18 | 19 | features: Optional[datasets.Features] = None 20 | encoding: str = "utf-8" 21 | 
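    # How to handle encoding errors; forwarded to open() as the "errors" argument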
encoding_errors: Optional[str] = None 22 | 23 | 24 | class Xml(datasets.ArrowBasedBuilder): 25 | BUILDER_CONFIG_CLASS = XmlConfig 26 | 27 | def _info(self): 28 | return datasets.DatasetInfo(features=self.config.features) 29 | 30 | def _split_generators(self, dl_manager): 31 | """The `data_files` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]]. 32 | 33 | If str or List[str], then the dataset returns only the 'train' split. 34 | If dict, then keys should be from the `datasets.Split` enum. 35 | """ 36 | if not self.config.data_files: 37 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 38 | dl_manager.download_config.extract_on_the_fly = True 39 | data_files = dl_manager.download_and_extract(self.config.data_files) 40 | splits = [] 41 | for split_name, files in data_files.items(): 42 | if isinstance(files, str): 43 | files = [files] 44 | files = [dl_manager.iter_files(file) for file in files] 45 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) 46 | return splits 47 | 48 | def _cast_table(self, pa_table: pa.Table) -> pa.Table: 49 | if self.config.features is not None: 50 | schema = self.config.features.arrow_schema 51 | if all(not require_storage_cast(feature) for feature in self.config.features.values()): 52 | # cheaper cast 53 | pa_table = pa_table.cast(schema) 54 | else: 55 | # more expensive cast; allows str <-> int/float or str to Audio for example 56 | pa_table = table_cast(pa_table, schema) 57 | return pa_table 58 | else: 59 | return pa_table.cast(pa.schema({"xml": pa.string()})) 60 | 61 | def _generate_tables(self, files): 62 | pa_table_names = list(self.config.features) if self.config.features is not None else ["xml"] 63 | for file_idx, file in enumerate(itertools.chain.from_iterable(files)): 64 | # open in text mode, by default translates universal newlines ("\n", "\r\n" and "\r") into "\n" 65 | with open(file, encoding=self.config.encoding, errors=self.config.encoding_errors) as f: 66 | xml = f.read() 67 | pa_table = pa.Table.from_arrays([pa.array([xml])], names=pa_table_names) 68 | yield (file_idx, 0), self._cast_table(pa_table) 69 | -------------------------------------------------------------------------------- /docs/source/use_with_spark.mdx: -------------------------------------------------------------------------------- 1 | # Use with Spark 2 | 3 | This document is a quick introduction to using 🤗 Datasets with Spark, with a particular focus on how to load a Spark DataFrame into a [`Dataset`] object. 4 | 5 | From there, you have fast access to any element and you can use it as a data loader to train models. 6 | 7 | ## Load from Spark 8 | 9 | A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to PyTorch, TensorFlow and JAX tensors. 10 | The Arrow table is memory mapped from disk, which can load datasets bigger than your available RAM. 11 | 12 | You can get a [`Dataset`] from a Spark DataFrame using [`Dataset.from_spark`]: 13 | 14 | ```py 15 | >>> from datasets import Dataset 16 | >>> df = spark.createDataFrame( 17 | ... data=[[1, "Elia"], [2, "Teo"], [3, "Fang"]], 18 | ... columns=["id", "name"], 19 | ... ) 20 | >>> ds = Dataset.from_spark(df) 21 | ``` 22 | 23 | The Spark workers write the dataset on disk in a cache directory as Arrow files, and the [`Dataset`] is loaded from there. 
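From there you can, for example, set the format to PyTorch and iterate over the dataset with a data loader. This is just a rough sketch that reuses the small `id`/`name` dataset from above:

```py
>>> from torch.utils.data import DataLoader
>>> ds = ds.with_format("torch")
>>> dataloader = DataLoader(ds, batch_size=2)
>>> for batch in dataloader:
...     ...
```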
24 | 25 | Alternatively, you can skip materialization by using [`IterableDataset.from_spark`], which returns an [`IterableDataset`]: 26 | 27 | ```py 28 | >>> from datasets import IterableDataset 29 | >>> df = spark.createDataFrame( 30 | ... data=[[1, "Elia"], [2, "Teo"], [3, "Fang"]], 31 | ... columns=["id", "name"], 32 | ... ) 33 | >>> ds = IterableDataset.from_spark(df) 34 | >>> print(next(iter(ds))) 35 | {"id": 1, "name": "Elia"} 36 | ``` 37 | 38 | ### Caching 39 | 40 | When using [`Dataset.from_spark`], the resulting [`Dataset`] is cached; if you call [`Dataset.from_spark`] multiple 41 | times on the same DataFrame it won't re-run the Spark job that writes the dataset as Arrow files on disk. 42 | 43 | You can set the cache location by passing `cache_dir=` to [`Dataset.from_spark`]. 44 | Make sure to use a disk that is available to both your workers and your current machine (the driver). 45 | 46 | > [!WARNING] 47 | > In a different session, a Spark DataFrame doesn't have the same [semantic hash](https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.DataFrame.semanticHash.html), and it will rerun a Spark job and store it in a new cache. 48 | 49 | ### Feature types 50 | 51 | If your dataset is made of images, audio data or N-dimensional arrays, you can specify the `features=` argument in 52 | [`Dataset.from_spark`] (or [`IterableDataset.from_spark`]): 53 | 54 | ```py 55 | >>> from datasets import Dataset, Features, Image, Value 56 | >>> data = [(0, open("image.png", "rb").read())] 57 | >>> df = spark.createDataFrame(data, "idx: int, image: binary") 58 | >>> # Also works if you have arrays 59 | >>> # data = [(0, np.zeros(shape=(32, 32, 3), dtype=np.int32).tolist())] 60 | >>> # df = spark.createDataFrame(data, "idx: int, image: array>>") 61 | >>> features = Features({"idx": Value("int64"), "image": Image()}) 62 | >>> dataset = Dataset.from_spark(df, features=features) 63 | >>> dataset[0] 64 | {'idx': 0, 'image': } 65 | ``` 66 | 67 | You can check the [`Features`] documentation to know about all the feature types available. 
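Along the same lines, here is a sketch for N-dimensional arrays using the `Array2D` feature type (the column name, shape and Spark schema below are made up for illustration):

```py
>>> import numpy as np
>>> from datasets import Dataset, Features, Array2D, Value
>>> data = [(0, np.zeros(shape=(32, 32), dtype=np.int32).tolist())]
>>> df = spark.createDataFrame(data, "idx: int, array: array<array<int>>")
>>> features = Features({"idx": Value("int64"), "array": Array2D(shape=(32, 32), dtype="int32")})
>>> dataset = Dataset.from_spark(df, features=features)
```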
68 | -------------------------------------------------------------------------------- /tests/commands/test_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import namedtuple 3 | 4 | import pytest 5 | 6 | from datasets import ClassLabel, Features, List, Value 7 | from datasets.commands.test import TestCommand 8 | from datasets.info import DatasetInfo, DatasetInfosDict 9 | 10 | 11 | _TestCommandArgs = namedtuple( 12 | "_TestCommandArgs", 13 | [ 14 | "dataset", 15 | "name", 16 | "cache_dir", 17 | "data_dir", 18 | "all_configs", 19 | "save_infos", 20 | "ignore_verifications", 21 | "force_redownload", 22 | "clear_cache", 23 | "num_proc", 24 | ], 25 | defaults=[None, None, None, False, False, False, False, False, None], 26 | ) 27 | 28 | 29 | def is_1percent_close(source, target): 30 | return (abs(source - target) / target) < 0.01 31 | 32 | 33 | @pytest.mark.integration 34 | def test_test_command(dataset_dir): 35 | args = _TestCommandArgs(dataset=dataset_dir, all_configs=True, save_infos=True) 36 | test_command = TestCommand(*args) 37 | test_command.run() 38 | dataset_readme_path = os.path.join(dataset_dir, "README.md") 39 | assert os.path.exists(dataset_readme_path) 40 | dataset_infos = DatasetInfosDict.from_directory(dataset_dir) 41 | expected_dataset_infos = DatasetInfosDict( 42 | { 43 | "default": DatasetInfo( 44 | features=Features( 45 | { 46 | "tokens": List(Value("string")), 47 | "ner_tags": List( 48 | ClassLabel(names=["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]) 49 | ), 50 | "langs": List(Value("string")), 51 | "spans": List(Value("string")), 52 | } 53 | ), 54 | splits=[ 55 | { 56 | "name": "train", 57 | "num_bytes": 2351563, 58 | "num_examples": 10000, 59 | }, 60 | { 61 | "name": "validation", 62 | "num_bytes": 238418, 63 | "num_examples": 1000, 64 | }, 65 | ], 66 | download_size=3940680, 67 | dataset_size=2589981, 68 | ) 69 | } 70 | ) 71 | assert dataset_infos.keys() == expected_dataset_infos.keys() 72 | for key in DatasetInfo._INCLUDED_INFO_IN_YAML: 73 | result, expected = getattr(dataset_infos["default"], key), getattr(expected_dataset_infos["default"], key) 74 | if key == "num_bytes": 75 | assert is_1percent_close(result, expected) 76 | elif key == "splits": 77 | assert list(result) == list(expected) 78 | for split in result: 79 | assert result[split].name == expected[split].name 80 | assert result[split].num_examples == expected[split].num_examples 81 | assert is_1percent_close(result[split].num_bytes, expected[split].num_bytes) 82 | else: 83 | result == expected 84 | -------------------------------------------------------------------------------- /docs/source/package_reference/loading_methods.mdx: -------------------------------------------------------------------------------- 1 | # Loading methods 2 | 3 | Methods for listing and loading datasets: 4 | 5 | ## Datasets 6 | 7 | [[autodoc]] datasets.load_dataset 8 | 9 | [[autodoc]] datasets.load_from_disk 10 | 11 | [[autodoc]] datasets.load_dataset_builder 12 | 13 | [[autodoc]] datasets.get_dataset_config_names 14 | 15 | [[autodoc]] datasets.get_dataset_infos 16 | 17 | [[autodoc]] datasets.get_dataset_split_names 18 | 19 | ## From files 20 | 21 | Configurations used to load data files. 
22 | They are used when loading local files or a dataset repository: 23 | 24 | - local files: `load_dataset("parquet", data_dir="path/to/data/dir")` 25 | - dataset repository: `load_dataset("allenai/c4")` 26 | 27 | You can pass arguments to `load_dataset` to configure data loading. 28 | For example you can specify the `sep` parameter to define the [`~datasets.packaged_modules.csv.CsvConfig`] that is used to load the data: 29 | 30 | ```python 31 | load_dataset("csv", data_dir="path/to/data/dir", sep="\t") 32 | ``` 33 | 34 | ### Text 35 | 36 | [[autodoc]] datasets.packaged_modules.text.TextConfig 37 | 38 | [[autodoc]] datasets.packaged_modules.text.Text 39 | 40 | ### CSV 41 | 42 | [[autodoc]] datasets.packaged_modules.csv.CsvConfig 43 | 44 | [[autodoc]] datasets.packaged_modules.csv.Csv 45 | 46 | ### JSON 47 | 48 | [[autodoc]] datasets.packaged_modules.json.JsonConfig 49 | 50 | [[autodoc]] datasets.packaged_modules.json.Json 51 | 52 | ### XML 53 | 54 | [[autodoc]] datasets.packaged_modules.xml.XmlConfig 55 | 56 | [[autodoc]] datasets.packaged_modules.xml.Xml 57 | 58 | ### Parquet 59 | 60 | [[autodoc]] datasets.packaged_modules.parquet.ParquetConfig 61 | 62 | [[autodoc]] datasets.packaged_modules.parquet.Parquet 63 | 64 | ### Arrow 65 | 66 | [[autodoc]] datasets.packaged_modules.arrow.ArrowConfig 67 | 68 | [[autodoc]] datasets.packaged_modules.arrow.Arrow 69 | 70 | ### SQL 71 | 72 | [[autodoc]] datasets.packaged_modules.sql.SqlConfig 73 | 74 | [[autodoc]] datasets.packaged_modules.sql.Sql 75 | 76 | ### Images 77 | 78 | [[autodoc]] datasets.packaged_modules.imagefolder.ImageFolderConfig 79 | 80 | [[autodoc]] datasets.packaged_modules.imagefolder.ImageFolder 81 | 82 | ### Audio 83 | 84 | [[autodoc]] datasets.packaged_modules.audiofolder.AudioFolderConfig 85 | 86 | [[autodoc]] datasets.packaged_modules.audiofolder.AudioFolder 87 | 88 | ### Videos 89 | 90 | [[autodoc]] datasets.packaged_modules.videofolder.VideoFolderConfig 91 | 92 | [[autodoc]] datasets.packaged_modules.videofolder.VideoFolder 93 | 94 | ### HDF5 95 | 96 | [[autodoc]] datasets.packaged_modules.hdf5.HDF5Config 97 | 98 | [[autodoc]] datasets.packaged_modules.hdf5.HDF5 99 | 100 | ### Pdf 101 | 102 | [[autodoc]] datasets.packaged_modules.pdffolder.PdfFolderConfig 103 | 104 | [[autodoc]] datasets.packaged_modules.pdffolder.PdfFolder 105 | 106 | ### Nifti 107 | 108 | [[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolderConfig 109 | 110 | [[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolder 111 | 112 | ### WebDataset 113 | 114 | [[autodoc]] datasets.packaged_modules.webdataset.WebDataset 115 | -------------------------------------------------------------------------------- /docs/source/filesystems.mdx: -------------------------------------------------------------------------------- 1 | # Cloud storage 2 | 3 | ## Hugging Face Datasets 4 | 5 | The Hugging Face Dataset Hub is home to a growing collection of datasets that span a variety of domains and tasks. 6 | 7 | It's more than a cloud storage: the Dataset Hub is a platform that provides data versioning thanks to git, as well as a Dataset Viewer to explore the data, making it a great place to store AI-ready datasets. 8 | 9 | This guide shows how to import data from other cloud storage using the filesystems implementations from `fsspec`. 10 | 11 | ## Import data from a cloud storage 12 | 13 | Most cloud storage providers have a `fsspec` FileSystem implementation, which is useful to import data from any cloud provider with the same code. 
14 | This is especially useful to publish datasets on Hugging Face. 15 | 16 | Take a look at the following table for some example of supported cloud storage providers: 17 | 18 | | Storage provider | Filesystem implementation | 19 | |----------------------|---------------------------------------------------------------| 20 | | Amazon S3 | [s3fs](https://s3fs.readthedocs.io/en/latest/) | 21 | | Google Cloud Storage | [gcsfs](https://gcsfs.readthedocs.io/en/latest/) | 22 | | Azure Blob/DataLake | [adlfs](https://github.com/fsspec/adlfs) | 23 | | Oracle Cloud Storage | [ocifs](https://ocifs.readthedocs.io/en/latest/) | 24 | 25 | This guide will show you how to import data files from any cloud storage and save a dataset on Hugging Face. 26 | 27 | Let's say we want to publish a dataset on Hugging Face from Parquet files from a cloud storage. 28 | 29 | First, instantiate your cloud storage filesystem and list the files you'd like to import: 30 | 31 | ```python 32 | >>> import fsspec 33 | >>> fs = fsspec.filesystem("...") # s3 / gcs / abfs / adl / oci / ... 34 | >>> data_dir = "path/to/my/data/" 35 | >>> pattern = "*.parquet" 36 | >>> data_files = fs.glob(data_dir + pattern) 37 | ["path/to/my/data/0001.parquet", "path/to/my/data/0001.parquet", ...] 38 | ``` 39 | 40 | Then you can create a dataset on Hugging Face and import the data files, using for example: 41 | 42 | ```python 43 | >>> from huggingface_hub import create_repo, upload_file 44 | >>> from tqdm.auto import tqdm 45 | >>> destination_dataset = "username/my-dataset" 46 | >>> create_repo(destination_dataset, repo_type="dataset") 47 | >>> for data_file in tqdm(fs.glob(data_dir + pattern)): 48 | ... with fs.open(data_file) as fileobj: 49 | ... path_in_repo = data_file[len(data_dir):] 50 | ... upload_file( 51 | ... path_or_fileobj=fileobj, 52 | ... path_in_repo=path_in_repo, 53 | ... repo_id=destination_dataset, 54 | ... repo_type="dataset", 55 | ... ) 56 | ``` 57 | 58 | Check out the [huggingface_hub](https://huggingface.co/docs/huggingface_hub) documentation on files uploads [here](https://huggingface.co/docs/huggingface_hub/en/guides/upload) if you're looking for more upload options. 59 | 60 | Finally you can now load the dataset using 🤗 Datasets: 61 | 62 | ```python 63 | >>> from datasets import load_dataset 64 | >>> ds = load_dataset("username/my-dataset") 65 | ``` 66 | -------------------------------------------------------------------------------- /templates/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | TODO: "Add YAML tags here. 
Delete these instructions and copy-paste the YAML tags obtained with the online tagging app: https://huggingface.co/spaces/huggingface/datasets-tagging" 3 | --- 4 | 5 | # Dataset Card for [Dataset Name] 6 | 7 | ## Table of Contents 8 | - [Table of Contents](#table-of-contents) 9 | - [Dataset Description](#dataset-description) 10 | - [Dataset Summary](#dataset-summary) 11 | - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards) 12 | - [Languages](#languages) 13 | - [Dataset Structure](#dataset-structure) 14 | - [Data Instances](#data-instances) 15 | - [Data Fields](#data-fields) 16 | - [Data Splits](#data-splits) 17 | - [Dataset Creation](#dataset-creation) 18 | - [Curation Rationale](#curation-rationale) 19 | - [Source Data](#source-data) 20 | - [Annotations](#annotations) 21 | - [Personal and Sensitive Information](#personal-and-sensitive-information) 22 | - [Considerations for Using the Data](#considerations-for-using-the-data) 23 | - [Social Impact of Dataset](#social-impact-of-dataset) 24 | - [Discussion of Biases](#discussion-of-biases) 25 | - [Other Known Limitations](#other-known-limitations) 26 | - [Additional Information](#additional-information) 27 | - [Dataset Curators](#dataset-curators) 28 | - [Licensing Information](#licensing-information) 29 | - [Citation Information](#citation-information) 30 | - [Contributions](#contributions) 31 | 32 | ## Dataset Description 33 | 34 | - **Homepage:** 35 | - **Repository:** 36 | - **Paper:** 37 | - **Leaderboard:** 38 | - **Point of Contact:** 39 | 40 | ### Dataset Summary 41 | 42 | [More Information Needed] 43 | 44 | ### Supported Tasks and Leaderboards 45 | 46 | [More Information Needed] 47 | 48 | ### Languages 49 | 50 | [More Information Needed] 51 | 52 | ## Dataset Structure 53 | 54 | ### Data Instances 55 | 56 | [More Information Needed] 57 | 58 | ### Data Fields 59 | 60 | [More Information Needed] 61 | 62 | ### Data Splits 63 | 64 | [More Information Needed] 65 | 66 | ## Dataset Creation 67 | 68 | ### Curation Rationale 69 | 70 | [More Information Needed] 71 | 72 | ### Source Data 73 | 74 | #### Initial Data Collection and Normalization 75 | 76 | [More Information Needed] 77 | 78 | #### Who are the source language producers? 79 | 80 | [More Information Needed] 81 | 82 | ### Annotations 83 | 84 | #### Annotation process 85 | 86 | [More Information Needed] 87 | 88 | #### Who are the annotators? 89 | 90 | [More Information Needed] 91 | 92 | ### Personal and Sensitive Information 93 | 94 | [More Information Needed] 95 | 96 | ## Considerations for Using the Data 97 | 98 | ### Social Impact of Dataset 99 | 100 | [More Information Needed] 101 | 102 | ### Discussion of Biases 103 | 104 | [More Information Needed] 105 | 106 | ### Other Known Limitations 107 | 108 | [More Information Needed] 109 | 110 | ## Additional Information 111 | 112 | ### Dataset Curators 113 | 114 | [More Information Needed] 115 | 116 | ### Licensing Information 117 | 118 | [More Information Needed] 119 | 120 | ### Citation Information 121 | 122 | [More Information Needed] 123 | 124 | ### Contributions 125 | 126 | Thanks to [@github-username](https://github.com/) for adding this dataset. 127 | -------------------------------------------------------------------------------- /src/datasets/naming.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Lint as: python3 16 | """Utilities for file names.""" 17 | 18 | import itertools 19 | import os 20 | import re 21 | 22 | 23 | _uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])") 24 | _lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])") 25 | 26 | _single_underscore_re = re.compile(r"(?:/\|?*" 32 | 33 | 34 | def camelcase_to_snakecase(name): 35 | """Convert camel-case string to snake-case.""" 36 | name = _uppercase_uppercase_re.sub(r"\1_\2", name) 37 | name = _lowercase_uppercase_re.sub(r"\1_\2", name) 38 | return name.lower() 39 | 40 | 41 | def snakecase_to_camelcase(name): 42 | """Convert snake-case string to camel-case string.""" 43 | name = _single_underscore_re.split(name) 44 | name = [_multiple_underscores_re.split(n) for n in name] 45 | return "".join(n.capitalize() for n in itertools.chain.from_iterable(name) if n != "") 46 | 47 | 48 | def filename_prefix_for_name(name): 49 | if os.path.basename(name) != name: 50 | raise ValueError(f"Should be a dataset name, not a path: {name}") 51 | return camelcase_to_snakecase(name) 52 | 53 | 54 | def filename_prefix_for_split(name, split): 55 | if os.path.basename(name) != name: 56 | raise ValueError(f"Should be a dataset name, not a path: {name}") 57 | if not re.match(_split_re, split): 58 | raise ValueError(f"Split name should match '{_split_re}'' but got '{split}'.") 59 | return f"{filename_prefix_for_name(name)}-{split}" 60 | 61 | 62 | def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None): 63 | prefix = filename_prefix_for_split(dataset_name, split) 64 | if filetype_suffix: 65 | prefix += f".{filetype_suffix}" 66 | filepath = os.path.join(data_dir, prefix) 67 | return f"{filepath}*" 68 | 69 | 70 | def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None): 71 | prefix = filename_prefix_for_split(dataset_name, split) 72 | prefix = os.path.join(path, prefix) 73 | 74 | if shard_lengths and len(shard_lengths) > 1: 75 | num_shards = len(shard_lengths) 76 | filenames = [f"{prefix}-{shard_id:05d}-of-{num_shards:05d}" for shard_id in range(num_shards)] 77 | if filetype_suffix: 78 | filenames = [filename + f".{filetype_suffix}" for filename in filenames] 79 | return filenames 80 | else: 81 | filename = prefix 82 | if filetype_suffix: 83 | filename += f".{filetype_suffix}" 84 | return [filename] 85 | -------------------------------------------------------------------------------- /docs/source/nlp_process.mdx: -------------------------------------------------------------------------------- 1 | # Process text data 2 | 3 | This guide shows specific methods for processing text datasets. Learn how to: 4 | 5 | - Tokenize a dataset with [`~Dataset.map`]. 6 | - Align dataset labels with label ids for NLI datasets. 7 | 8 | For a guide on how to process any type of dataset, take a look at the general process guide. 
9 | 10 | ## Map 11 | 12 | The [`~Dataset.map`] function supports processing batches of examples at once which speeds up tokenization. 13 | 14 | Load a tokenizer from 🤗 [Transformers](https://huggingface.co/transformers/): 15 | 16 | ```py 17 | >>> from transformers import AutoTokenizer 18 | 19 | >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") 20 | ``` 21 | 22 | Set the `batched` parameter to `True` in the [`~Dataset.map`] function to apply the tokenizer to batches of examples: 23 | 24 | ```py 25 | >>> dataset = dataset.map(lambda examples: tokenizer(examples["text"]), batched=True) 26 | >>> dataset[0] 27 | {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 28 | 'label': 1, 29 | 'input_ids': [101, 1996, 2600, 2003, 16036, 2000, 2022, 1996, 7398, 2301, 1005, 1055, 2047, 1000, 16608, 1000, 1998, 2008, 2002, 1005, 1055, 2183, 2000, 2191, 1037, 17624, 2130, 3618, 2084, 7779, 29058, 8625, 13327, 1010, 3744, 1011, 18856, 19513, 3158, 5477, 4168, 2030, 7112, 16562, 2140, 1012, 102], 30 | 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 31 | 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 32 | ``` 33 | 34 | The [`~Dataset.map`] function converts the returned values to a PyArrow-supported format. But explicitly returning the tensors as NumPy arrays is faster because it is a natively supported PyArrow format. Set `return_tensors="np"` when you tokenize your text: 35 | 36 | ```py 37 | >>> dataset = dataset.map(lambda examples: tokenizer(examples["text"], return_tensors="np"), batched=True) 38 | ``` 39 | 40 | ## Align 41 | 42 | The [`~Dataset.align_labels_with_mapping`] function aligns a dataset label id with the label name. Not all 🤗 Transformers models follow the prescribed label mapping of the original dataset, especially for NLI datasets. For example, the [MNLI](https://huggingface.co/datasets/glue) dataset uses the following label mapping: 43 | 44 | ```py 45 | >>> label2id = {"entailment": 0, "neutral": 1, "contradiction": 2} 46 | ``` 47 | 48 | To align the dataset label mapping with the mapping used by a model, create a dictionary of the label name and id to align on: 49 | 50 | ```py 51 | >>> label2id = {"contradiction": 0, "neutral": 1, "entailment": 2} 52 | ``` 53 | 54 | Pass the dictionary of the label mappings to the [`~Dataset.align_labels_with_mapping`] function, and the column to align on: 55 | 56 | ```py 57 | >>> from datasets import load_dataset 58 | 59 | >>> mnli = load_dataset("nyu-mll/glue", "mnli", split="train") 60 | >>> mnli_aligned = mnli.align_labels_with_mapping(label2id, "label") 61 | ``` 62 | 63 | You can also use this function to assign a custom mapping of labels to ids. 
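If you want to confirm the alignment did what you expect, one option (a small sketch that assumes the `mnli` and `mnli_aligned` objects from the snippet above) is to compare the `ClassLabel` names, whose order follows the label ids:

```py
>>> mnli.features["label"].names           # original MNLI order
['entailment', 'neutral', 'contradiction']
>>> mnli_aligned.features["label"].names   # order defined by the label2id mapping above
['contradiction', 'neutral', 'entailment']
```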
-------------------------------------------------------------------------------- /tests/test_hub.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | from types import SimpleNamespace 3 | from unittest.mock import patch 4 | from urllib.parse import quote 5 | 6 | import pytest 7 | from huggingface_hub import CommitOperationAdd, CommitOperationDelete 8 | 9 | import datasets 10 | from datasets.config import METADATA_CONFIGS_FIELD 11 | from datasets.hub import delete_from_hub 12 | from datasets.utils.hub import hf_dataset_url 13 | 14 | 15 | @pytest.mark.parametrize("repo_id", ["canonical_dataset_name", "org-name/dataset-name"]) 16 | @pytest.mark.parametrize("filename", ["filename.csv", "filename with blanks.csv"]) 17 | @pytest.mark.parametrize("revision", [None, "v2"]) 18 | def test_dataset_url(repo_id, filename, revision): 19 | url = hf_dataset_url(repo_id=repo_id, filename=filename, revision=revision) 20 | assert url == f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{quote(filename)}" 21 | 22 | 23 | def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_config) -> None: 24 | with temporary_repo() as repo_id: 25 | hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset") 26 | hf_api.upload_file( 27 | path_or_fileobj=str(csv_path), 28 | path_in_repo="cats/train/0000.csv", 29 | repo_id=repo_id, 30 | repo_type="dataset", 31 | token=hf_token, 32 | ) 33 | hf_api.upload_file( 34 | path_or_fileobj=str(csv_path), 35 | path_in_repo="dogs/train/0000.csv", 36 | repo_id=repo_id, 37 | repo_type="dataset", 38 | token=hf_token, 39 | ) 40 | hf_api.upload_file( 41 | token=hf_token, 42 | path_or_fileobj=dedent( 43 | f"""\ 44 | --- 45 | {METADATA_CONFIGS_FIELD}: 46 | - config_name: cats 47 | data_files: 48 | - split: train 49 | path: cats/train/* 50 | - config_name: dogs 51 | data_files: 52 | - split: train 53 | path: dogs/train/* 54 | --- 55 | """ 56 | ).encode(), 57 | path_in_repo="README.md", 58 | repo_id=repo_id, 59 | repo_type="dataset", 60 | ) 61 | commit_info = SimpleNamespace( 62 | pr_url="https:///hub-ci.huggingface.co/datasets/__DUMMY_USER__/__DUMMY_DATASET__/refs%2Fpr%2F1" 63 | ) 64 | with patch.object(datasets.hub.HfApi, "create_commit", return_value=commit_info) as mock_method: 65 | _ = delete_from_hub(repo_id, "dogs") 66 | assert mock_method.called 67 | assert mock_method.call_args.kwargs.get("commit_message") == "Delete 'dogs' config" 68 | assert mock_method.call_args.kwargs.get("create_pr") 69 | expected_operations = [ 70 | CommitOperationDelete(path_in_repo="dogs/train/0000.csv", is_folder=False), 71 | CommitOperationAdd( 72 | path_in_repo="README.md", 73 | path_or_fileobj=dedent( 74 | f"""\ 75 | --- 76 | {METADATA_CONFIGS_FIELD}: 77 | - config_name: cats 78 | data_files: 79 | - split: train 80 | path: cats/train/* 81 | --- 82 | """ 83 | ).encode(), 84 | ), 85 | ] 86 | assert mock_method.call_args.kwargs.get("operations") == expected_operations 87 | -------------------------------------------------------------------------------- /docs/source/image_classification.mdx: -------------------------------------------------------------------------------- 1 | # Image classification 2 | 3 | Image classification datasets are used to train a model to classify an entire image. There are a wide variety of applications enabled by these datasets such as identifying endangered wildlife species or screening for disease in medical images. 
This guide will show you how to apply transformations to an image classification dataset. 4 | 5 | Before you start, make sure you have up-to-date versions of `albumentations` and `cv2` installed: 6 | 7 | ```bash 8 | pip install -U albumentations opencv-python 9 | ``` 10 | 11 | This guide uses the [Beans](https://huggingface.co/datasets/beans) dataset for identifying the type of bean plant disease based on an image of its leaf. 12 | 13 | Load the dataset and take a look at an example: 14 | 15 | ```py 16 | >>> from datasets import load_dataset 17 | 18 | >>> dataset = load_dataset("AI-Lab-Makerere/beans") 19 | >>> dataset["train"][10] 20 | {'image': , 21 | 'image_file_path': '/root/.cache/huggingface/datasets/downloads/extracted/b0a21163f78769a2cf11f58dfc767fb458fc7cea5c05dccc0144a2c0f0bc1292/train/angular_leaf_spot/angular_leaf_spot_train.204.jpg', 22 | 'labels': 0} 23 | ``` 24 | 25 | The dataset has three fields: 26 | 27 | * `image`: a PIL image object. 28 | * `image_file_path`: the path to the image file. 29 | * `labels`: the label or category of the image. 30 | 31 | Next, check out an image: 32 | 33 |
34 | [image: an example leaf image from the Beans dataset] 35 |
36 | 37 | Now apply some augmentations with `albumentations`. You'll randomly crop the image, flip it horizontally, and adjust its brightness. 38 | 39 | ```py 40 | >>> import cv2 41 | >>> import albumentations 42 | >>> import numpy as np 43 | 44 | >>> transform = albumentations.Compose([ 45 | ... albumentations.RandomCrop(width=256, height=256), 46 | ... albumentations.HorizontalFlip(p=0.5), 47 | ... albumentations.RandomBrightnessContrast(p=0.2), 48 | ... ]) 49 | ``` 50 | 51 | Create a function to apply the transformation to the images: 52 | 53 | ```py 54 | >>> def transforms(examples): 55 | ... examples["pixel_values"] = [ 56 | ... transform(image=np.array(image))["image"] for image in examples["image"] 57 | ... ] 58 | ... 59 | ... return examples 60 | ``` 61 | 62 | Use the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset to consume less disk space: 63 | 64 | ```py 65 | >>> dataset.set_transform(transforms) 66 | ``` 67 | 68 | You can verify the transformation worked by indexing into the `pixel_values` of the first example: 69 | 70 | ```py 71 | >>> import numpy as np 72 | >>> import matplotlib.pyplot as plt 73 | 74 | >>> img = dataset["train"][0]["pixel_values"] 75 | >>> plt.imshow(img) 76 | ``` 77 | 78 |
79 | [image: the leaf image after the random crop, horizontal flip, and brightness augmentations] 80 | 81 |
82 | 83 | > [!TIP] 84 | > Now that you know how to process a dataset for image classification, learn 85 | > [how to train an image classification model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb) 86 | > and use it for inference. -------------------------------------------------------------------------------- /docs/source/about_map_batch.mdx: -------------------------------------------------------------------------------- 1 | # Batch mapping 2 | 3 | Combining the utility of [`Dataset.map`] with batch mode is very powerful. It allows you to speed up processing, and freely control the size of the generated dataset. 4 | 5 | ## Need for speed 6 | 7 | The primary objective of batch mapping is to speed up processing. Often times, it is faster to work with batches of data instead of single examples. Naturally, batch mapping lends itself to tokenization. For example, the 🤗 [Tokenizers](https://huggingface.co/docs/tokenizers/python/latest/) library works faster with batches because it parallelizes the tokenization of all the examples in a batch. 8 | 9 | ## Input size != output size 10 | 11 | The ability to control the size of the generated dataset can be leveraged for many interesting use-cases. In the How-to [map](#map) section, there are examples of using batch mapping to: 12 | 13 | - Split long sentences into shorter chunks. 14 | - Augment a dataset with additional tokens. 15 | 16 | It is helpful to understand how this works, so you can come up with your own ways to use batch mapping. At this point, you may be wondering how you can control the size of the generated dataset. The answer is: **the mapped function does not have to return an output batch of the same size**. 17 | 18 | In other words, your mapped function input can be a batch of size `N` and return a batch of size `M`. The output `M` can be greater than or less than `N`. This means you can concatenate your examples, divide it up, and even add more examples! 19 | 20 | However, remember that all values in the output dictionary must contain the **same number of elements** as the other fields in the output dictionary. Otherwise, it is not possible to define the number of examples in the output returned by the mapped function. The number can vary between successive batches processed by the mapped function. For a single batch though, all values of the output dictionary should have the same length (i.e., the number of elements). 21 | 22 | For example, from a dataset of 1 column and 3 rows, if you use `map` to return a new column with twice as many rows, then you will have an error. 23 | In this case, you end up with one column with 3 rows, and one column with 6 rows. As you can see, the table will not be valid: 24 | 25 | ```py 26 | >>> from datasets import Dataset 27 | >>> dataset = Dataset.from_dict({"a": [0, 1, 2]}) 28 | >>> dataset.map(lambda batch: {"b": batch["a"] * 2}, batched=True) # new column with 6 elements: [0, 1, 2, 0, 1, 2] 29 | 'ArrowInvalid: Column 1 named b expected length 3 but got length 6' 30 | ``` 31 | 32 | To make it valid, you have to drop one of the columns: 33 | 34 | ```py 35 | >>> from datasets import Dataset 36 | >>> dataset = Dataset.from_dict({"a": [0, 1, 2]}) 37 | >>> dataset_with_duplicates = dataset.map(lambda batch: {"b": batch["a"] * 2}, remove_columns=["a"], batched=True) 38 | >>> len(dataset_with_duplicates) 39 | 6 40 | ``` 41 | Alternatively, you can overwrite the existing column to achieve the same result. 
42 | For example, here’s how to duplicate every row in the dataset by overwriting column `"a"`: 43 | 44 | ```py 45 | >>> from datasets import Dataset 46 | >>> dataset = Dataset.from_dict({"a": [0, 1, 2]}) 47 | # overwrites the existing "a" column with duplicated values 48 | >>> duplicated_dataset = dataset.map( 49 | ... lambda batch: {"a": [x for x in batch["a"] for _ in range(2)]}, 50 | ... batched=True 51 | ... ) 52 | >>> duplicated_dataset 53 | Dataset({ 54 | features: ['a'], 55 | num_rows: 6 56 | }) 57 | >>> duplicated_dataset["a"] 58 | [0, 0, 1, 1, 2, 2] 59 | ``` 60 | -------------------------------------------------------------------------------- /docs/source/about_cache.mdx: -------------------------------------------------------------------------------- 1 | # The cache 2 | 3 | The cache is one of the reasons why 🤗 Datasets is so efficient. It stores previously downloaded and processed datasets so when you need to use them again, they are reloaded directly from the cache. This avoids having to download a dataset all over again, or reapplying processing functions. Even after you close and start another Python session, 🤗 Datasets will reload your dataset directly from the cache! 4 | 5 | ## Fingerprint 6 | 7 | How does the cache keeps track of what transforms are applied to a dataset? Well, 🤗 Datasets assigns a fingerprint to the cache file. A fingerprint keeps track of the current state of a dataset. The initial fingerprint is computed using a hash from the Arrow table, or a hash of the Arrow files if the dataset is on disk. Subsequent fingerprints are computed by combining the fingerprint of the previous state, and a hash of the latest transform applied. 8 | 9 | > [!TIP] 10 | > Transforms are any of the processing methods from the [How-to Process](./process) guides such as [`Dataset.map`] or [`Dataset.shuffle`]. 11 | 12 | Here are what the actual fingerprints look like: 13 | 14 | ```py 15 | >>> from datasets import Dataset 16 | >>> dataset1 = Dataset.from_dict({"a": [0, 1, 2]}) 17 | >>> dataset2 = dataset1.map(lambda x: {"a": x["a"] + 1}) 18 | >>> print(dataset1._fingerprint, dataset2._fingerprint) 19 | d19493523d95e2dc 5b86abacd4b42434 20 | ``` 21 | 22 | In order for a transform to be hashable, it needs to be picklable by [dill](https://dill.readthedocs.io/en/latest/) or [pickle](https://docs.python.org/3/library/pickle). 23 | 24 | When you use a non-hashable transform, 🤗 Datasets uses a random fingerprint instead and raises a warning. The non-hashable transform is considered different from the previous transforms. As a result, 🤗 Datasets will recompute all the transforms. Make sure your transforms are serializable with pickle or dill to avoid this! 25 | 26 | An example of when 🤗 Datasets recomputes everything is when caching is disabled. When this happens, the cache files are generated every time and they get written to a temporary directory. Once your Python session ends, the cache files in the temporary directory are deleted. A random hash is assigned to these cache files, instead of a fingerprint. 27 | 28 | > [!TIP] 29 | > When caching is disabled, use [`Dataset.save_to_disk`] to save your transformed dataset or it will be deleted once the session ends. 30 | 31 | ## Hashing 32 | 33 | The fingerprint of a dataset is updated by hashing the function passed to `map` as well as the `map` parameters (`batch_size`, `remove_columns`, etc.). 
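To see this in action, here is a small illustrative check (a sketch, not part of the original guide): mapping the same function with different parameters yields different fingerprints, so the two results are cached separately:

```py
>>> from datasets import Dataset
>>> ds = Dataset.from_dict({"a": list(range(10))})
>>> add_one = lambda batch: {"b": [x + 1 for x in batch["a"]]}
>>> ds1 = ds.map(add_one, batched=True, batch_size=2)
>>> ds2 = ds.map(add_one, batched=True, batch_size=5)  # same function, different `batch_size`
>>> ds1._fingerprint == ds2._fingerprint
False
```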
34 | 35 | You can check the hash of any Python object using the [`fingerprint.Hasher`]: 36 | 37 | ```py 38 | >>> from datasets.fingerprint import Hasher 39 | >>> my_func = lambda example: {"length": len(example["text"])} 40 | >>> print(Hasher.hash(my_func)) 41 | '3d35e2b3e94c81d6' 42 | ``` 43 | 44 | The hash is computed by dumping the object using a `dill` pickler and hashing the dumped bytes. 45 | The pickler recursively dumps all the variables used in your function, so any change you do to an object that is used in your function, will cause the hash to change. 46 | 47 | If one of your functions doesn't seem to have the same hash across sessions, it means at least one of its variables contains a Python object that is not deterministic. 48 | When this happens, feel free to hash any object you find suspicious to try to find the object that caused the hash to change. 49 | For example, if you use a list for which the order of its elements is not deterministic across sessions, then the hash won't be the same across sessions either. 50 | -------------------------------------------------------------------------------- /docs/source/image_process.mdx: -------------------------------------------------------------------------------- 1 | # Process image data 2 | 3 | This guide shows specific methods for processing image datasets. Learn how to: 4 | 5 | - Use [`~Dataset.map`] with image dataset. 6 | - Apply data augmentations to a dataset with [`~Dataset.set_transform`]. 7 | 8 | For a guide on how to process any type of dataset, take a look at the general process guide. 9 | 10 | ## Map 11 | 12 | The [`~Dataset.map`] function can apply transforms over an entire dataset. 13 | 14 | For example, create a basic [`Resize`](https://pytorch.org/vision/stable/generated/torchvision.transforms.Resize.html) function: 15 | 16 | ```py 17 | >>> def transforms(examples): 18 | ... examples["pixel_values"] = [image.convert("RGB").resize((100,100)) for image in examples["image"]] 19 | ... return examples 20 | ``` 21 | 22 | Now use the [`~Dataset.map`] function to resize the entire dataset, and set `batched=True` to speed up the process by accepting batches of examples. The transform returns `pixel_values` as a cacheable `PIL.Image` object: 23 | 24 | ```py 25 | >>> dataset = dataset.map(transforms, remove_columns=["image"], batched=True) 26 | >>> dataset[0] 27 | {'label': 6, 28 | 'pixel_values': } 29 | ``` 30 | 31 | The cache file saves time because you don't have to execute the same transform twice. The [`~Dataset.map`] function is best for operations you only run once per training - like resizing an image - instead of using it for operations executed for each epoch, like data augmentations. 32 | 33 | [`~Dataset.map`] takes up some memory, but you can reduce its memory requirements with the following parameters: 34 | 35 | - [`batch_size`](./package_reference/main_classes#datasets.DatasetDict.map.batch_size) determines the number of examples that are processed in one call to the transform function. 36 | - [`writer_batch_size`](./package_reference/main_classes#datasets.DatasetDict.map.writer_batch_size) determines the number of processed examples that are kept in memory before they are stored away. 37 | 38 | Both parameter values default to 1000, which can be expensive if you are storing images. Lower these values to use less memory when you use [`~Dataset.map`]. 39 | 40 | ## Apply transforms 41 | 42 | 🤗 Datasets applies data augmentations from any library or package to your dataset. 
Transforms can be applied on-the-fly on batches of data with [`~Dataset.set_transform`], which consumes less disk space. 43 | 44 | > [!TIP] 45 | > The following example uses [torchvision](https://pytorch.org/vision/stable/index.html), but feel free to use other data augmentation libraries like [Albumentations](https://albumentations.ai/docs/), [Kornia](https://kornia.readthedocs.io/en/latest/), and [imgaug](https://imgaug.readthedocs.io/en/latest/). 46 | 47 | For example, if you'd like to change the color properties of an image randomly: 48 | 49 | ```py 50 | >>> from torchvision.transforms import Compose, ColorJitter, ToTensor 51 | 52 | >>> jitter = Compose( 53 | ... [ 54 | ... ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.7), 55 | ... ToTensor(), 56 | ... ] 57 | ... ) 58 | ``` 59 | 60 | Create a function to apply the `ColorJitter` transform: 61 | 62 | ```py 63 | >>> def transforms(examples): 64 | ... examples["pixel_values"] = [jitter(image.convert("RGB")) for image in examples["image"]] 65 | ... return examples 66 | ``` 67 | 68 | Apply the transform with the [`~Dataset.set_transform`] function: 69 | 70 | ```py 71 | >>> dataset.set_transform(transforms) 72 | ``` -------------------------------------------------------------------------------- /docs/source/audio_process.mdx: -------------------------------------------------------------------------------- 1 | # Process audio data 2 | 3 | This guide shows specific methods for processing audio datasets. Learn how to: 4 | 5 | - Resample the sampling rate. 6 | - Use [`~Dataset.map`] with audio datasets. 7 | 8 | For a guide on how to process any type of dataset, take a look at the general process guide. 9 | 10 | ## Cast 11 | 12 | The [`~Dataset.cast_column`] function is used to cast a column to another feature to be decoded. When you use this function with the [`Audio`] feature, you can resample the sampling rate: 13 | 14 | ```py 15 | >>> from datasets import load_dataset, Audio 16 | 17 | >>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") 18 | >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) 19 | ``` 20 | 21 | Audio files are decoded and resampled on-the-fly, so the next time you access an example, the audio file is resampled to 16kHz: 22 | 23 | ```py 24 | >>> audio = dataset[0]["audio"] 25 | 26 | >>> audio = audio_dataset[0]["audio"] 27 | >>> samples = audio.get_all_samples() 28 | >>> samples.data 29 | tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3447e-06, 30 | -1.9127e-04, -5.3330e-05]] 31 | >>> samples.sample_rate 32 | 16000 33 | ``` 34 | 35 |
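As an extra check (a brief sketch assuming the `dataset` from the snippet above), the new rate is also visible on the dataset's features after the cast:

```py
>>> dataset.features["audio"]  # the Audio feature now carries the new sampling rate
Audio(sampling_rate=16000, ...)
```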
36 | [audio player: audio sample before resampling] 40 | [audio player: audio sample resampled to 16kHz] 44 |
45 | 46 | ## Map 47 | 48 | The [`~Dataset.map`] function helps preprocess your entire dataset at once. Depending on the type of model you're working with, you'll need to either load a [feature extractor](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoFeatureExtractor) or a [processor](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoProcessor). 49 | 50 | - For pretrained speech recognition models, load a feature extractor and tokenizer and combine them in a `processor`: 51 | 52 | ```py 53 | >>> from transformers import AutoTokenizer, AutoFeatureExtractor, AutoProcessor 54 | 55 | >>> model_checkpoint = "facebook/wav2vec2-large-xlsr-53" 56 | # after defining a vocab.json file you can instantiate a tokenizer object: 57 | >>> tokenizer = AutoTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") 58 | >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint) 59 | >>> processor = AutoProcessor.from_pretrained(feature_extractor=feature_extractor, tokenizer=tokenizer) 60 | ``` 61 | 62 | - For fine-tuned speech recognition models, you only need to load a `processor`: 63 | 64 | ```py 65 | >>> from transformers import AutoProcessor 66 | 67 | >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") 68 | ``` 69 | 70 | When you use [`~Dataset.map`] with your preprocessing function, include the `audio` column to ensure you're actually resampling the audio data: 71 | 72 | ```py 73 | >>> def prepare_dataset(batch): 74 | ... audio = batch["audio"] 75 | ... batch["input_values"] = processor(audio.get_all_samples().data, sampling_rate=audio["sampling_rate"]).input_values[0] 76 | ... batch["input_length"] = len(batch["input_values"]) 77 | ... with processor.as_target_processor(): 78 | ... batch["labels"] = processor(batch["sentence"]).input_ids 79 | ... return batch 80 | >>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names) 81 | ``` 82 | -------------------------------------------------------------------------------- /src/datasets/utils/version.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Lint as: python3 16 | """Version utils.""" 17 | 18 | import dataclasses 19 | import re 20 | from dataclasses import dataclass 21 | from functools import total_ordering 22 | from typing import Optional, Union 23 | 24 | 25 | _VERSION_REG = re.compile(r"^(?P<major>\d+)" r"\.(?P<minor>\d+)" r"\.(?P<patch>\d+)$") 26 | 27 | 28 | @total_ordering 29 | @dataclass 30 | class Version: 31 | """Dataset version `MAJOR.MINOR.PATCH`. 32 | 33 | Args: 34 | version_str (`str`): 35 | The dataset version. 36 | description (`str`): 37 | A description of what is new in this version.
38 | major (`str`): 39 | minor (`str`): 40 | patch (`str`): 41 | 42 | Example: 43 | 44 | ```py 45 | >>> VERSION = datasets.Version("1.0.0") 46 | ``` 47 | """ 48 | 49 | version_str: str 50 | description: Optional[str] = None 51 | major: Optional[Union[str, int]] = None 52 | minor: Optional[Union[str, int]] = None 53 | patch: Optional[Union[str, int]] = None 54 | 55 | def __post_init__(self): 56 | self.major, self.minor, self.patch = _str_to_version_tuple(self.version_str) 57 | 58 | def __repr__(self): 59 | return f"{self.tuple[0]}.{self.tuple[1]}.{self.tuple[2]}" 60 | 61 | @property 62 | def tuple(self): 63 | return self.major, self.minor, self.patch 64 | 65 | def _validate_operand(self, other): 66 | if isinstance(other, str): 67 | return Version(other) 68 | elif isinstance(other, Version): 69 | return other 70 | raise TypeError(f"{other} (type {type(other)}) cannot be compared to version.") 71 | 72 | def __eq__(self, other): 73 | try: 74 | other = self._validate_operand(other) 75 | except (TypeError, ValueError): 76 | return False 77 | else: 78 | return self.tuple == other.tuple 79 | 80 | def __lt__(self, other): 81 | other = self._validate_operand(other) 82 | return self.tuple < other.tuple 83 | 84 | def __hash__(self): 85 | return hash(_version_tuple_to_str(self.tuple)) 86 | 87 | @classmethod 88 | def from_dict(cls, dic): 89 | field_names = {f.name for f in dataclasses.fields(cls)} 90 | return cls(**{k: v for k, v in dic.items() if k in field_names}) 91 | 92 | def _to_yaml_string(self) -> str: 93 | return self.version_str 94 | 95 | 96 | def _str_to_version_tuple(version_str): 97 | """Return the tuple (major, minor, patch) version extracted from the str.""" 98 | res = _VERSION_REG.match(version_str) 99 | if not res: 100 | raise ValueError(f"Invalid version '{version_str}'. 
Format should be x.y.z with {{x,y,z}} being digits.") 101 | return tuple(int(v) for v in [res.group("major"), res.group("minor"), res.group("patch")]) 102 | 103 | 104 | def _version_tuple_to_str(version_tuple): 105 | """Return the str version from the version tuple (major, minor, patch).""" 106 | return ".".join(str(v) for v in version_tuple) 107 | -------------------------------------------------------------------------------- /src/datasets/packaged_modules/arrow/arrow.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from dataclasses import dataclass 3 | from typing import Optional 4 | 5 | import pyarrow as pa 6 | 7 | import datasets 8 | from datasets.builder import Key 9 | from datasets.table import table_cast 10 | 11 | 12 | logger = datasets.utils.logging.get_logger(__name__) 13 | 14 | 15 | @dataclass 16 | class ArrowConfig(datasets.BuilderConfig): 17 | """BuilderConfig for Arrow.""" 18 | 19 | features: Optional[datasets.Features] = None 20 | 21 | def __post_init__(self): 22 | super().__post_init__() 23 | 24 | 25 | class Arrow(datasets.ArrowBasedBuilder): 26 | BUILDER_CONFIG_CLASS = ArrowConfig 27 | 28 | def _info(self): 29 | return datasets.DatasetInfo(features=self.config.features) 30 | 31 | def _split_generators(self, dl_manager): 32 | """We handle string, list and dicts in datafiles""" 33 | if not self.config.data_files: 34 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 35 | dl_manager.download_config.extract_on_the_fly = True 36 | data_files = dl_manager.download_and_extract(self.config.data_files) 37 | splits = [] 38 | for split_name, files in data_files.items(): 39 | if isinstance(files, str): 40 | files = [files] 41 | # Use `dl_manager.iter_files` to skip hidden files in an extracted archive 42 | files = [dl_manager.iter_files(file) for file in files] 43 | # Infer features if they are stored in the arrow schema 44 | if self.info.features is None: 45 | for file in itertools.chain.from_iterable(files): 46 | with open(file, "rb") as f: 47 | try: 48 | reader = pa.ipc.open_stream(f) 49 | except (OSError, pa.lib.ArrowInvalid): 50 | reader = pa.ipc.open_file(f) 51 | self.info.features = datasets.Features.from_arrow_schema(reader.schema) 52 | break 53 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) 54 | return splits 55 | 56 | def _cast_table(self, pa_table: pa.Table) -> pa.Table: 57 | if self.info.features is not None: 58 | # more expensive cast to support nested features with keys in a different order 59 | # allows str <-> int/float or str to Audio for example 60 | pa_table = table_cast(pa_table, self.info.features.arrow_schema) 61 | return pa_table 62 | 63 | def _generate_tables(self, files): 64 | for file_idx, file in enumerate(itertools.chain.from_iterable(files)): 65 | with open(file, "rb") as f: 66 | try: 67 | try: 68 | batches = pa.ipc.open_stream(f) 69 | except (OSError, pa.lib.ArrowInvalid): 70 | reader = pa.ipc.open_file(f) 71 | batches = (reader.get_batch(i) for i in range(reader.num_record_batches)) 72 | for batch_idx, record_batch in enumerate(batches): 73 | pa_table = pa.Table.from_batches([record_batch]) 74 | # Uncomment for debugging (will print the Arrow table size and elements) 75 | # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}") 76 | # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows))) 77 | yield Key(file_idx, batch_idx), 
self._cast_table(pa_table) 78 | except ValueError as e: 79 | logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") 80 | raise 81 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "license": "Apache-2.0", 3 | "creators": [ 4 | { 5 | "affiliation": "Hugging Face", 6 | "name": "Quentin Lhoest" 7 | }, 8 | { 9 | "orcid": "0000-0003-1727-1045", 10 | "affiliation": "Hugging Face", 11 | "name": "Albert Villanova del Moral" 12 | }, 13 | { 14 | "affiliation": "Hugging Face", 15 | "name": "Patrick von Platen" 16 | }, 17 | { 18 | "affiliation": "Hugging Face", 19 | "name": "Thomas Wolf" 20 | }, 21 | { 22 | "affiliation": "Hugging Face", 23 | "name": "Mario Šaško" 24 | }, 25 | { 26 | "affiliation": "Hugging Face", 27 | "name": "Yacine Jernite" 28 | }, 29 | { 30 | "affiliation": "Hugging Face", 31 | "name": "Abhishek Thakur" 32 | }, 33 | { 34 | "affiliation": "Hugging Face", 35 | "name": "Lewis Tunstall" 36 | }, 37 | { 38 | "affiliation": "Hugging Face", 39 | "name": "Suraj Patil" 40 | }, 41 | { 42 | "affiliation": "Hugging Face", 43 | "name": "Mariama Drame" 44 | }, 45 | { 46 | "affiliation": "Hugging Face", 47 | "name": "Julien Chaumond" 48 | }, 49 | { 50 | "affiliation": "Hugging Face", 51 | "name": "Julien Plu" 52 | }, 53 | { 54 | "affiliation": "Hugging Face", 55 | "name": "Joe Davison" 56 | }, 57 | { 58 | "affiliation": "Hugging Face", 59 | "name": "Simon Brandeis" 60 | }, 61 | { 62 | "affiliation": "Hugging Face", 63 | "name": "Victor Sanh" 64 | }, 65 | { 66 | "affiliation": "Hugging Face", 67 | "name": "Teven Le Scao" 68 | }, 69 | { 70 | "affiliation": "Hugging Face", 71 | "name": "Kevin Canwen Xu" 72 | }, 73 | { 74 | "affiliation": "Hugging Face", 75 | "name": "Nicolas Patry" 76 | }, 77 | { 78 | "affiliation": "Hugging Face", 79 | "name": "Steven Liu" 80 | }, 81 | { 82 | "affiliation": "Hugging Face", 83 | "name": "Angelina McMillan-Major" 84 | }, 85 | { 86 | "affiliation": "Hugging Face", 87 | "name": "Philipp Schmid" 88 | }, 89 | { 90 | "affiliation": "Hugging Face", 91 | "name": "Sylvain Gugger" 92 | }, 93 | { 94 | "affiliation": "Hugging Face", 95 | "name": "Nathan Raw" 96 | }, 97 | { 98 | "affiliation": "Hugging Face", 99 | "name": "Sylvain Lesage" 100 | }, 101 | { 102 | "affiliation": "Hugging Face", 103 | "name": "Anton Lozhkov" 104 | }, 105 | { 106 | "affiliation": "Hugging Face", 107 | "name": "Matthew Carrigan" 108 | }, 109 | { 110 | "affiliation": "Hugging Face", 111 | "name": "Th\u00e9o Matussi\u00e8re" 112 | }, 113 | { 114 | "affiliation": "Hugging Face", 115 | "name": "Leandro von Werra" 116 | }, 117 | { 118 | "affiliation": "Hugging Face", 119 | "name": "Lysandre Debut" 120 | }, 121 | { 122 | "affiliation": "Hugging Face", 123 | "name": "Stas Bekman" 124 | }, 125 | { 126 | "affiliation": "Hugging Face", 127 | "name": "Cl\u00e9ment Delangue" 128 | } 129 | ] 130 | } -------------------------------------------------------------------------------- /src/datasets/utils/deprecation_utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import inspect 3 | import warnings 4 | from functools import wraps 5 | from typing import Callable, Optional 6 | 7 | from .logging import get_logger 8 | 9 | 10 | _emitted_deprecation_warnings = set() 11 | logger = get_logger(__name__) 12 | 13 | 14 | def deprecated(help_message: Optional[str] = None): 15 | """Decorator to mark a class or a 
function as deprecated. 16 | 17 | Args: 18 | help_message (:obj:`str`, optional): An optional message to guide the user on how to 19 | switch to non-deprecated usage of the library. 20 | """ 21 | 22 | def decorator(deprecated_class_or_function: Callable): 23 | global _emitted_deprecation_warnings 24 | 25 | if inspect.isclass(deprecated_class_or_function): 26 | deprecated_function = deprecated_class_or_function.__init__ 27 | name = deprecated_class_or_function.__name__ 28 | else: 29 | deprecated_function = deprecated_class_or_function 30 | name = deprecated_function.__name__ 31 | # Support deprecating __init__ class method: class name instead 32 | name = name if name != "__init__" else deprecated_function.__qualname__.split(".")[-2] 33 | 34 | warning_msg = ( 35 | f"{name} is deprecated and will be removed in the next major version of datasets." + f" {help_message}" 36 | if help_message 37 | else "" 38 | ) 39 | 40 | @wraps(deprecated_function) 41 | def wrapper(*args, **kwargs): 42 | func_hash = hash(deprecated_function) 43 | if func_hash not in _emitted_deprecation_warnings: 44 | warnings.warn(warning_msg, category=FutureWarning, stacklevel=2) 45 | _emitted_deprecation_warnings.add(func_hash) 46 | return deprecated_function(*args, **kwargs) 47 | 48 | wrapper._decorator_name_ = "deprecated" 49 | 50 | if inspect.isclass(deprecated_class_or_function): 51 | deprecated_class_or_function.__init__ = wrapper 52 | return deprecated_class_or_function 53 | else: 54 | return wrapper 55 | 56 | return decorator 57 | 58 | 59 | class OnAccess(enum.EnumMeta): 60 | """ 61 | Enum metaclass that calls a user-specified function whenever a member is accessed. 62 | """ 63 | 64 | def __getattribute__(cls, name): 65 | obj = super().__getattribute__(name) 66 | if isinstance(obj, enum.Enum) and obj._on_access: 67 | obj._on_access() 68 | return obj 69 | 70 | def __getitem__(cls, name): 71 | member = super().__getitem__(name) 72 | if member._on_access: 73 | member._on_access() 74 | return member 75 | 76 | def __call__(cls, value, names=None, *, module=None, qualname=None, type=None, start=1): 77 | obj = super().__call__(value, names, module=module, qualname=qualname, type=type, start=start) 78 | if isinstance(obj, enum.Enum) and obj._on_access: 79 | obj._on_access() 80 | return obj 81 | 82 | 83 | class DeprecatedEnum(enum.Enum, metaclass=OnAccess): 84 | """ 85 | Enum class that calls `deprecate` method whenever a member is accessed. 86 | """ 87 | 88 | def __new__(cls, value): 89 | member = object.__new__(cls) 90 | member._value_ = value 91 | member._on_access = member.deprecate 92 | return member 93 | 94 | @property 95 | def help_message(self): 96 | return "" 97 | 98 | def deprecate(self): 99 | help_message = f" {self.help_message}" if self.help_message else "" 100 | warnings.warn( 101 | f"'{self.__objclass__.__name__}' is deprecated and will be removed in the next major version of datasets." 102 | + help_message, 103 | FutureWarning, 104 | stacklevel=3, 105 | ) 106 | -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Before you start, you'll need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.9+**. 4 | 5 | > [!TIP] 6 | > If you want to use 🤗 Datasets with TensorFlow or PyTorch, you'll need to install them separately. 
Refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2-packages-are-available) or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) for the specific install command for your framework. 7 | 8 | ## Virtual environment 9 | 10 | You should install 🤗 Datasets in a [virtual environment](https://docs.python.org/3/library/venv.html) to keep things tidy and avoid dependency conflicts. 11 | 12 | 1. Create and navigate to your project directory: 13 | 14 | ```bash 15 | mkdir ~/my-project 16 | cd ~/my-project 17 | ``` 18 | 19 | 2. Start a virtual environment inside your directory: 20 | 21 | ```bash 22 | python -m venv .env 23 | ``` 24 | 25 | 3. Activate and deactivate the virtual environment with the following commands: 26 | 27 | ```bash 28 | # Activate the virtual environment 29 | source .env/bin/activate 30 | 31 | # Deactivate the virtual environment 32 | source .env/bin/deactivate 33 | ``` 34 | 35 | Once you've created your virtual environment, you can install 🤗 Datasets in it. 36 | 37 | ## pip 38 | 39 | The most straightforward way to install 🤗 Datasets is with pip: 40 | 41 | ```bash 42 | pip install datasets 43 | ``` 44 | 45 | Run the following command to check if 🤗 Datasets has been properly installed: 46 | 47 | ```bash 48 | python -c "from datasets import load_dataset; print(load_dataset('rajpurkar/squad', split='train')[0])" 49 | ``` 50 | 51 | This command downloads version 1 of the [Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/), loads the training split, and prints the first training example. You should see: 52 | 53 | ```python 54 | {'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'id': '5733be284776f41900661182', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'title': 'University_of_Notre_Dame'} 55 | ``` 56 | 57 | ## Audio 58 | 59 | To work with audio datasets, you need to install the [`Audio`] feature as an extra dependency: 60 | 61 | ```bash 62 | pip install datasets[audio] 63 | ``` 64 | 65 | ## Vision 66 | 67 | To work with image datasets, you need to install the [`Image`] feature as an extra dependency: 68 | 69 | ```bash 70 | pip install datasets[vision] 71 | ``` 72 | 73 | ## source 74 | 75 | Building 🤗 Datasets from source lets you make changes to the code base. To install from the source, clone the repository and install with the following commands: 76 | 77 | ```bash 78 | git clone https://github.com/huggingface/datasets.git 79 | cd datasets 80 | pip install -e . 
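# the -e ("editable") flag installs the package from the cloned sources,
# so local changes to the code base take effect without reinstalling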
81 | ``` 82 | 83 | Again, you can check if 🤗 Datasets was properly installed with the following command: 84 | 85 | ```bash 86 | python -c "from datasets import load_dataset; print(load_dataset('rajpurkar/squad', split='train')[0])" 87 | ``` 88 | 89 | ## conda 90 | 91 | 🤗 Datasets can also be installed from conda, a package management system: 92 | 93 | ```bash 94 | conda install -c huggingface -c conda-forge datasets 95 | ``` 96 | -------------------------------------------------------------------------------- /docs/source/use_with_pyarrow.mdx: -------------------------------------------------------------------------------- 1 | # Use with PyArrow 2 | 3 | This document is a quick introduction to using `datasets` with PyArrow, with a particular focus on how to process 4 | datasets using Arrow compute functions, and how to convert a dataset to PyArrow or from PyArrow. 5 | 6 | This is particularly useful as it allows fast zero-copy operations, since `datasets` uses PyArrow under the hood. 7 | 8 | ## Dataset format 9 | 10 | By default, datasets return regular Python objects: integers, floats, strings, lists, etc. 11 | 12 | To get PyArrow Tables or Arrays instead, you can set the format of the dataset to `pyarrow` using [`Dataset.with_format`]: 13 | 14 | ```py 15 | >>> from datasets import Dataset 16 | >>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]} 17 | >>> ds = Dataset.from_dict(data) 18 | >>> ds = ds.with_format("arrow") 19 | >>> ds[0] # pa.Table 20 | pyarrow.Table 21 | col_0: string 22 | col_1: double 23 | ---- 24 | col_0: [["a"]] 25 | col_1: [[0]] 26 | >>> ds[:2] # pa.Table 27 | pyarrow.Table 28 | col_0: string 29 | col_1: double 30 | ---- 31 | col_0: [["a","b"]] 32 | col_1: [[0,0]] 33 | >>> ds["data"] # pa.array 34 | 35 | [ 36 | [ 37 | "a", 38 | "b", 39 | "c", 40 | "d" 41 | ] 42 | ] 43 | ``` 44 | 45 | This also works for `IterableDataset` objects obtained e.g. using `load_dataset(..., streaming=True)`: 46 | 47 | ```py 48 | >>> ds = ds.with_format("arrow") 49 | >>> for table in ds.iter(batch_size=2): 50 | ... print(table) 51 | ... break 52 | pyarrow.Table 53 | col_0: string 54 | col_1: double 55 | ---- 56 | col_0: [["a","b"]] 57 | col_1: [[0,0]] 58 | ``` 59 | 60 | ## Process data 61 | 62 | PyArrow functions are generally faster than regular hand-written python functions, and therefore they are a good option to optimize data processing. You can use Arrow compute functions to process a dataset in [`Dataset.map`] or [`Dataset.filter`]: 63 | 64 | ```python 65 | >>> import pyarrow.compute as pc 66 | >>> from datasets import Dataset 67 | >>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]} 68 | >>> ds = Dataset.from_dict(data) 69 | >>> ds = ds.with_format("arrow") 70 | >>> ds = ds.map(lambda t: t.append_column("col_2", pc.add(t["col_1"], 1)), batched=True) 71 | >>> ds[:2] 72 | pyarrow.Table 73 | col_0: string 74 | col_1: double 75 | col_2: double 76 | ---- 77 | col_0: [["a","b"]] 78 | col_1: [[0,0]] 79 | col_2: [[1,1]] 80 | >>> ds = ds.filter(lambda t: pc.equal(t["col_0"], "b"), batched=True) 81 | >>> ds[0] 82 | pyarrow.Table 83 | col_0: string 84 | col_1: double 85 | col_2: double 86 | ---- 87 | col_0: [["b"]] 88 | col_1: [[0]] 89 | col_2: [[1]] 90 | ``` 91 | 92 | We use `batched=True` because it is faster to process batches of data in PyArrow rather than row by row. It's also possible to use `batch_size=` in `map()` to set the size of each `table`. 93 | 94 | This also works for [`IterableDataset.map`] and [`IterableDataset.filter`]. 
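For instance, here is a minimal sketch that reuses the same toy data as above for the streaming case:

```py
>>> import pyarrow.compute as pc
>>> from datasets import Dataset
>>> data = {"col_0": ["a", "b", "c", "d"], "col_1": [0., 0., 1., 1.]}
>>> ids = Dataset.from_dict(data).to_iterable_dataset()
>>> ids = ids.with_format("arrow")
>>> ids = ids.map(lambda t: t.append_column("col_2", pc.add(t["col_1"], 1)), batched=True)
>>> ids = ids.filter(lambda t: pc.equal(t["col_0"], "b"), batched=True)
>>> for table in ids.iter(batch_size=2):
...     print(table)  # a pyarrow.Table containing only the rows where col_0 == "b"
...     break
```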
95 | 96 | ## Import or Export from PyArrow 97 | 98 | A [`Dataset`] is a wrapper of a PyArrow Table, you can instantiate a Dataset directly from the Table: 99 | 100 | ```python 101 | ds = Dataset(table) 102 | ``` 103 | 104 | You can access the PyArrow Table of a dataset using [`Dataset.data`], which returns a [`MemoryMappedTable`] or a [`InMemoryTable`] or a [`ConcatenationTable`], depending on the origin of the Arrow data and the operations that were applied. 105 | 106 | Those objects wrap the underlying PyArrow table accessible at `Dataset.data.table`. This table contains all the data of the dataset, but there might also be an indices mapping at `Dataset._indices` which maps the dataset rows indices to the PyArrow Table rows indices. This can happen if the dataset has been shuffled with [`Dataset.shuffle`] or if only a subset of the rows are used (e.g. after a [`Dataset.select`]). 107 | 108 | In the general case, you can export a dataset to a PyArrow Table using `table = ds.with_format("arrow")[:]`. 109 | -------------------------------------------------------------------------------- /src/datasets/download/download_config.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from dataclasses import dataclass, field 3 | from pathlib import Path 4 | from typing import Any, Optional, Union 5 | 6 | from .. import config 7 | 8 | 9 | @dataclass 10 | class DownloadConfig: 11 | """Configuration for our cached path manager. 12 | 13 | Attributes: 14 | cache_dir (`str` or `Path`, *optional*): 15 | Specify a cache directory to save the file to (overwrite the 16 | default cache dir). 17 | force_download (`bool`, defaults to `False`): 18 | If `True`, re-download the file even if it's already cached in 19 | the cache dir. 20 | resume_download (`bool`, defaults to `False`): 21 | If `True`, resume the download if an incompletely received file is 22 | found. 23 | proxies (`dict`, *optional*): 24 | user_agent (`str`, *optional*): 25 | Optional string or dict that will be appended to the user-agent on remote 26 | requests. 27 | extract_compressed_file (`bool`, defaults to `False`): 28 | If `True` and the path point to a zip or tar file, 29 | extract the compressed file in a folder along the archive. 30 | force_extract (`bool`, defaults to `False`): 31 | If `True` when `extract_compressed_file` is `True` and the archive 32 | was already extracted, re-extract the archive and override the folder where it was extracted. 33 | delete_extracted (`bool`, defaults to `False`): 34 | Whether to delete (or keep) the extracted files. 35 | extract_on_the_fly (`bool`, defaults to `False`): 36 | If `True`, extract compressed files while they are being read. 37 | use_etag (`bool`, defaults to `True`): 38 | Whether to use the ETag HTTP response header to validate the cached files. 39 | num_proc (`int`, *optional*): 40 | The number of processes to launch to download the files in parallel. 41 | max_retries (`int`, default to `1`): 42 | The number of times to retry an HTTP request if it fails. 43 | token (`str` or `bool`, *optional*): 44 | Optional string or boolean to use as Bearer token 45 | for remote files on the Datasets Hub. If `True`, or not specified, will get token from `~/.huggingface`. 46 | storage_options (`dict`, *optional*): 47 | Key/value pairs to be passed on to the dataset file-system backend, if any. 48 | download_desc (`str`, *optional*): 49 | A description to be displayed alongside with the progress bar while downloading the files. 
50 | disable_tqdm (`bool`, defaults to `False`): 51 | Whether to disable the individual files download progress bar. 52 | """ 53 | 54 | cache_dir: Optional[Union[str, Path]] = None 55 | force_download: bool = False 56 | resume_download: bool = False 57 | local_files_only: bool = False 58 | proxies: Optional[dict] = None 59 | user_agent: Optional[str] = None 60 | extract_compressed_file: bool = False 61 | force_extract: bool = False 62 | delete_extracted: bool = False 63 | extract_on_the_fly: bool = False 64 | use_etag: bool = True 65 | num_proc: Optional[int] = None 66 | max_retries: int = 1 67 | token: Optional[Union[str, bool]] = None 68 | storage_options: dict[str, Any] = field(default_factory=dict) 69 | download_desc: Optional[str] = None 70 | disable_tqdm: bool = False 71 | 72 | def copy(self) -> "DownloadConfig": 73 | return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()}) 74 | 75 | def __setattr__(self, name, value): 76 | if name == "token" and getattr(self, "storage_options", None) is not None: 77 | if "hf" not in self.storage_options: 78 | self.storage_options["hf"] = {"endpoint": config.HF_ENDPOINT, "token": value} 79 | elif self.storage_options["hf"].get("token") is None: # dict lookup, so an already-set token is preserved 80 | self.storage_options["hf"]["token"] = value 81 | super().__setattr__(name, value) 82 | --------------------------------------------------------------------------------
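As a usage sketch (hypothetical, not part of the file above), a `DownloadConfig` is typically built once and handed to the loading functions; setting `token` after construction triggers the `__setattr__` hook that mirrors it into `storage_options["hf"]`:

```py
>>> from datasets import DownloadConfig, load_dataset

>>> dl_config = DownloadConfig(max_retries=3, num_proc=4)
>>> dl_config.token = True  # mirrored into storage_options["hf"] by __setattr__
>>> dl_config.storage_options["hf"]["token"]
True
>>> ds = load_dataset("rajpurkar/squad", split="train", download_config=dl_config)
```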