├── .github └── workflows │ ├── pypi-release.yml │ ├── testing.yml │ └── trufflehog.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── Makefile ├── README.md ├── examples ├── exact_substrings.py ├── filter_hf_dataset.py ├── fineweb.py ├── inference_example_chunked.py ├── minhash_deduplication.py ├── process_common_crawl_dump.py ├── sentence_deduplication.py ├── summary_stats.py ├── tokenize_c4.py ├── tokenize_from_hf_to_s3.py └── url_deduplication.py ├── pyproject.toml ├── src └── datatrove │ ├── __init__.py │ ├── assets │ ├── banned_subwords.txt │ ├── banned_words.txt │ ├── soft_banned_words.txt │ ├── tokenizer_assignment.csv │ └── url_filterblacklistsv0_3_0.tar.gz │ ├── data.py │ ├── executor │ ├── __init__.py │ ├── base.py │ ├── local.py │ ├── ray.py │ └── slurm.py │ ├── io.py │ ├── pipeline │ ├── __init__.py │ ├── base.py │ ├── decont │ │ ├── __init__.py │ │ └── n_grams.py │ ├── dedup │ │ ├── __init__.py │ │ ├── bloom_filter.py │ │ ├── exact_substrings.py │ │ ├── minhash.py │ │ ├── sentence_dedup.py │ │ └── url_dedup.py │ ├── extractors │ │ ├── __init__.py │ │ ├── base.py │ │ ├── modular.py │ │ └── trafilatura.py │ ├── filters │ │ ├── __init__.py │ │ ├── base_filter.py │ │ ├── c4_filters.py │ │ ├── fasttext_filter.py │ │ ├── fineweb_quality_filter.py │ │ ├── gopher_quality_filter.py │ │ ├── gopher_repetition_filter.py │ │ ├── lambda_filter.py │ │ ├── language_filter.py │ │ ├── regex_filter.py │ │ ├── sampler_filter.py │ │ ├── unigram_log_probs.py │ │ └── url_filter.py │ ├── formatters │ │ ├── __init__.py │ │ ├── base.py │ │ ├── ftfy.py │ │ ├── pii.py │ │ └── symbol_lines_remover.py │ ├── inference │ │ ├── __init__.py │ │ ├── checkpointing.py │ │ ├── distributed │ │ │ ├── __init__.py │ │ │ ├── ray.py │ │ │ └── utils.py │ │ ├── metrics.py │ │ ├── run_inference.py │ │ ├── servers │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── dummy_server.py │ │ │ ├── endpoint_server.py │ │ │ ├── sglang_server.py │ │ │ └── vllm_server.py │ │ └── types.py │ ├── readers │ │ ├── __init__.py │ │ ├── base.py │ │ ├── csv.py │ │ ├── huggingface.py │ │ ├── ipc.py │ │ ├── jsonl.py │ │ ├── parquet.py │ │ └── warc.py │ ├── stats │ │ ├── __init__.py │ │ ├── base.py │ │ ├── config.py │ │ ├── contamination_stats.py │ │ ├── doc_stats.py │ │ ├── lang_stats.py │ │ ├── line_stats.py │ │ ├── merger.py │ │ ├── paragraph_stats.py │ │ ├── perplexity_stats.py │ │ ├── sentence_stats.py │ │ ├── token_stats.py │ │ └── word_stats.py │ ├── tokens │ │ ├── __init__.py │ │ ├── context_shuffler.py │ │ ├── counter.py │ │ ├── megatron_tokenizer.py │ │ ├── merger.py │ │ └── tokenizer.py │ └── writers │ │ ├── __init__.py │ │ ├── disk_base.py │ │ ├── huggingface.py │ │ ├── jsonl.py │ │ └── parquet.py │ ├── tools │ ├── __init__.py │ ├── check_dataset.py │ ├── failed_logs.py │ ├── fast_mh3 │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── src │ │ │ ├── local_union_find.rs │ │ │ └── s3_union_find.rs │ ├── inspect_data.py │ ├── jobs_status.py │ ├── launch_pickled_pipeline.py │ ├── merge_stats.py │ └── track_jobs.py │ └── utils │ ├── __init__.py │ ├── _import_utils.py │ ├── batching.py │ ├── binaryio.py │ ├── dataset.py │ ├── hashes │ ├── sha1.py │ └── xxhash.py │ ├── hashing.py │ ├── japanese_tokenizer.py │ ├── jobs.py │ ├── lid.py │ ├── logging.py │ ├── perplexity.py │ ├── stats.py │ ├── text.py │ ├── tokenization.py │ ├── typeshelper.py │ └── word_tokenizers.py └── tests ├── __init__.py ├── executor ├── __init__.py ├── test_local.py └── test_ray.py ├── pipeline ├── __init__.py ├── inference │ └── __init__.py ├── test_adapter_reader.py ├── test_base.py ├── test_bloom_filter.py ├── test_exact_substrings.py ├── test_extractors.py ├── test_filters.py ├── test_hf_reader.py ├── test_inference.py ├── test_ipc_reader.py ├── test_jsonl_zstd_compression.py ├── test_minhash.py ├── test_ngrams_decont.py ├── test_parquet_reader.py ├── test_parquet_writer.py ├── test_parquet_zstd_compression.py ├── test_pii_removal.py ├── test_request_cache.py ├── test_sentence_deduplication.py ├── test_stats.py ├── test_symbollines.py ├── test_text.py ├── test_tokenization.py ├── test_url_deduplication.py └── test_word_tokenizers.py ├── test_io.py └── utils.py /.github/workflows/pypi-release.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/.github/workflows/pypi-release.yml -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/.github/workflows/testing.yml -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/.github/workflows/trufflehog.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/CITATION.cff -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/README.md -------------------------------------------------------------------------------- /examples/exact_substrings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/exact_substrings.py -------------------------------------------------------------------------------- /examples/filter_hf_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/filter_hf_dataset.py -------------------------------------------------------------------------------- /examples/fineweb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/fineweb.py -------------------------------------------------------------------------------- /examples/inference_example_chunked.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/inference_example_chunked.py -------------------------------------------------------------------------------- /examples/minhash_deduplication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/minhash_deduplication.py -------------------------------------------------------------------------------- /examples/process_common_crawl_dump.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/process_common_crawl_dump.py -------------------------------------------------------------------------------- /examples/sentence_deduplication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/sentence_deduplication.py -------------------------------------------------------------------------------- /examples/summary_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/summary_stats.py -------------------------------------------------------------------------------- /examples/tokenize_c4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/tokenize_c4.py -------------------------------------------------------------------------------- /examples/tokenize_from_hf_to_s3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/tokenize_from_hf_to_s3.py -------------------------------------------------------------------------------- /examples/url_deduplication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/examples/url_deduplication.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/pyproject.toml -------------------------------------------------------------------------------- /src/datatrove/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatrove/assets/banned_subwords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/assets/banned_subwords.txt -------------------------------------------------------------------------------- /src/datatrove/assets/banned_words.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/assets/banned_words.txt -------------------------------------------------------------------------------- /src/datatrove/assets/soft_banned_words.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/assets/soft_banned_words.txt -------------------------------------------------------------------------------- /src/datatrove/assets/tokenizer_assignment.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/assets/tokenizer_assignment.csv -------------------------------------------------------------------------------- /src/datatrove/assets/url_filterblacklistsv0_3_0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/assets/url_filterblacklistsv0_3_0.tar.gz -------------------------------------------------------------------------------- /src/datatrove/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/data.py -------------------------------------------------------------------------------- /src/datatrove/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/executor/__init__.py -------------------------------------------------------------------------------- /src/datatrove/executor/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/executor/base.py -------------------------------------------------------------------------------- /src/datatrove/executor/local.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/executor/local.py -------------------------------------------------------------------------------- /src/datatrove/executor/ray.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/executor/ray.py -------------------------------------------------------------------------------- /src/datatrove/executor/slurm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/executor/slurm.py -------------------------------------------------------------------------------- /src/datatrove/io.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/io.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatrove/pipeline/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/base.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/decont/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/decont/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/decont/n_grams.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/decont/n_grams.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/dedup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/dedup/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/dedup/bloom_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/dedup/bloom_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/dedup/exact_substrings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/dedup/exact_substrings.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/dedup/minhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/dedup/minhash.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/dedup/sentence_dedup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/dedup/sentence_dedup.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/dedup/url_dedup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/dedup/url_dedup.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/extractors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/extractors/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/extractors/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/extractors/base.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/extractors/modular.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/extractors/modular.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/extractors/trafilatura.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/extractors/trafilatura.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/base_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/base_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/c4_filters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/c4_filters.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/fasttext_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/fasttext_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/fineweb_quality_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/fineweb_quality_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/gopher_quality_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/gopher_quality_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/gopher_repetition_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/gopher_repetition_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/lambda_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/lambda_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/language_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/language_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/regex_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/regex_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/sampler_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/sampler_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/unigram_log_probs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/unigram_log_probs.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/filters/url_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/filters/url_filter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/formatters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/formatters/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/formatters/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/formatters/base.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/formatters/ftfy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/formatters/ftfy.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/formatters/pii.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/formatters/pii.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/formatters/symbol_lines_remover.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/formatters/symbol_lines_remover.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/checkpointing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/checkpointing.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/distributed/ray.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/distributed/ray.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/distributed/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/distributed/utils.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/metrics.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/run_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/run_inference.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/servers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/servers/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/servers/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/servers/base.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/servers/dummy_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/servers/dummy_server.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/servers/endpoint_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/servers/endpoint_server.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/servers/sglang_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/servers/sglang_server.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/servers/vllm_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/servers/vllm_server.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/inference/types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/inference/types.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/readers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/readers/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/readers/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/readers/base.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/readers/csv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/readers/csv.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/readers/huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/readers/huggingface.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/readers/ipc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/readers/ipc.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/readers/jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/readers/jsonl.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/readers/parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/readers/parquet.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/readers/warc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/readers/warc.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/base.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/config.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/contamination_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/contamination_stats.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/doc_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/doc_stats.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/lang_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/lang_stats.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/line_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/line_stats.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/merger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/merger.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/paragraph_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/paragraph_stats.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/perplexity_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/perplexity_stats.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/sentence_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/sentence_stats.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/token_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/token_stats.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/stats/word_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/stats/word_stats.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/tokens/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/tokens/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/tokens/context_shuffler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/tokens/context_shuffler.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/tokens/counter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/tokens/counter.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/tokens/megatron_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/tokens/megatron_tokenizer.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/tokens/merger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/tokens/merger.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/tokens/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/tokens/tokenizer.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/writers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/writers/__init__.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/writers/disk_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/writers/disk_base.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/writers/huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/writers/huggingface.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/writers/jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/writers/jsonl.py -------------------------------------------------------------------------------- /src/datatrove/pipeline/writers/parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/pipeline/writers/parquet.py -------------------------------------------------------------------------------- /src/datatrove/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatrove/tools/check_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/check_dataset.py -------------------------------------------------------------------------------- /src/datatrove/tools/failed_logs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/failed_logs.py -------------------------------------------------------------------------------- /src/datatrove/tools/fast_mh3/Cargo.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/fast_mh3/Cargo.lock -------------------------------------------------------------------------------- /src/datatrove/tools/fast_mh3/Cargo.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/fast_mh3/Cargo.toml -------------------------------------------------------------------------------- /src/datatrove/tools/fast_mh3/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/fast_mh3/README.md -------------------------------------------------------------------------------- /src/datatrove/tools/fast_mh3/src/local_union_find.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/fast_mh3/src/local_union_find.rs -------------------------------------------------------------------------------- /src/datatrove/tools/fast_mh3/src/s3_union_find.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/fast_mh3/src/s3_union_find.rs -------------------------------------------------------------------------------- /src/datatrove/tools/inspect_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/inspect_data.py -------------------------------------------------------------------------------- /src/datatrove/tools/jobs_status.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/jobs_status.py -------------------------------------------------------------------------------- /src/datatrove/tools/launch_pickled_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/launch_pickled_pipeline.py -------------------------------------------------------------------------------- /src/datatrove/tools/merge_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/merge_stats.py -------------------------------------------------------------------------------- /src/datatrove/tools/track_jobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/tools/track_jobs.py -------------------------------------------------------------------------------- /src/datatrove/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatrove/utils/_import_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/_import_utils.py -------------------------------------------------------------------------------- /src/datatrove/utils/batching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/batching.py -------------------------------------------------------------------------------- /src/datatrove/utils/binaryio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/binaryio.py -------------------------------------------------------------------------------- /src/datatrove/utils/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/dataset.py -------------------------------------------------------------------------------- /src/datatrove/utils/hashes/sha1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/hashes/sha1.py -------------------------------------------------------------------------------- /src/datatrove/utils/hashes/xxhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/hashes/xxhash.py -------------------------------------------------------------------------------- /src/datatrove/utils/hashing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/hashing.py -------------------------------------------------------------------------------- /src/datatrove/utils/japanese_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/japanese_tokenizer.py -------------------------------------------------------------------------------- /src/datatrove/utils/jobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/jobs.py -------------------------------------------------------------------------------- /src/datatrove/utils/lid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/lid.py -------------------------------------------------------------------------------- /src/datatrove/utils/logging.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/logging.py -------------------------------------------------------------------------------- /src/datatrove/utils/perplexity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/perplexity.py -------------------------------------------------------------------------------- /src/datatrove/utils/stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/stats.py -------------------------------------------------------------------------------- /src/datatrove/utils/text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/text.py -------------------------------------------------------------------------------- /src/datatrove/utils/tokenization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/tokenization.py -------------------------------------------------------------------------------- /src/datatrove/utils/typeshelper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/typeshelper.py -------------------------------------------------------------------------------- /src/datatrove/utils/word_tokenizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/src/datatrove/utils/word_tokenizers.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/executor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/executor/test_local.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/executor/test_local.py -------------------------------------------------------------------------------- /tests/executor/test_ray.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/executor/test_ray.py -------------------------------------------------------------------------------- /tests/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/pipeline/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/pipeline/test_adapter_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_adapter_reader.py -------------------------------------------------------------------------------- /tests/pipeline/test_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_base.py -------------------------------------------------------------------------------- /tests/pipeline/test_bloom_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_bloom_filter.py -------------------------------------------------------------------------------- /tests/pipeline/test_exact_substrings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_exact_substrings.py -------------------------------------------------------------------------------- /tests/pipeline/test_extractors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_extractors.py -------------------------------------------------------------------------------- /tests/pipeline/test_filters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_filters.py -------------------------------------------------------------------------------- /tests/pipeline/test_hf_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_hf_reader.py -------------------------------------------------------------------------------- /tests/pipeline/test_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_inference.py -------------------------------------------------------------------------------- /tests/pipeline/test_ipc_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_ipc_reader.py -------------------------------------------------------------------------------- /tests/pipeline/test_jsonl_zstd_compression.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_jsonl_zstd_compression.py -------------------------------------------------------------------------------- /tests/pipeline/test_minhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_minhash.py -------------------------------------------------------------------------------- /tests/pipeline/test_ngrams_decont.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_ngrams_decont.py -------------------------------------------------------------------------------- /tests/pipeline/test_parquet_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_parquet_reader.py -------------------------------------------------------------------------------- /tests/pipeline/test_parquet_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_parquet_writer.py -------------------------------------------------------------------------------- /tests/pipeline/test_parquet_zstd_compression.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_parquet_zstd_compression.py -------------------------------------------------------------------------------- /tests/pipeline/test_pii_removal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_pii_removal.py -------------------------------------------------------------------------------- /tests/pipeline/test_request_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_request_cache.py -------------------------------------------------------------------------------- /tests/pipeline/test_sentence_deduplication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_sentence_deduplication.py -------------------------------------------------------------------------------- /tests/pipeline/test_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_stats.py -------------------------------------------------------------------------------- /tests/pipeline/test_symbollines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_symbollines.py -------------------------------------------------------------------------------- /tests/pipeline/test_text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_text.py -------------------------------------------------------------------------------- /tests/pipeline/test_tokenization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_tokenization.py -------------------------------------------------------------------------------- /tests/pipeline/test_url_deduplication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_url_deduplication.py -------------------------------------------------------------------------------- /tests/pipeline/test_word_tokenizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/pipeline/test_word_tokenizers.py -------------------------------------------------------------------------------- /tests/test_io.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/test_io.py -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datatrove/HEAD/tests/utils.py --------------------------------------------------------------------------------