├── .coveragerc
├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.yml
│ ├── custom.md
│ ├── feature_request.yml
│ └── question.yml
└── workflows
│ ├── deploy_sphinx_docs.yml
│ ├── docker
│ └── docker-compose.yml
│ ├── perf-bench.yml
│ ├── pre-commit.yml
│ ├── publish-docker-oss.yml
│ ├── publish-docker.yml
│ ├── publish-pypi.yml
│ ├── sphinx_docs_linkcheck.yml
│ ├── stale.yml
│ ├── unit-test-partial.yml
│ └── unit-test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pre-commit-hooks
├── build_op_doc.py
└── tag_mappings.json
├── .secrets.baseline
├── Dockerfile
├── LICENSE
├── README.md
├── README_ZH.md
├── app.py
├── configs
├── annotation
│ ├── README.md
│ ├── annotation_default.yaml
│ └── annotation_with_notifications.yaml
├── config_all.yaml
├── config_min.yaml
├── data_juicer_recipes
│ ├── alpaca_cot
│ │ ├── README.md
│ │ ├── README_ZH.md
│ │ ├── alpaca-cot-en-refine.yaml
│ │ └── alpaca-cot-zh-refine.yaml
│ ├── data-juicer-sandbox-optimal.yaml
│ ├── data-juicer-sandbox-self-evolution.yaml
│ ├── general-video-refine-example.yaml
│ ├── github_code
│ │ ├── redpajama-code-refine.yaml
│ │ ├── redpajama-stack-code-deduplicate.yaml
│ │ └── stack-code-refine.yaml
│ ├── img-diff-recipe.yaml
│ ├── llava-pretrain-refine.yaml
│ ├── pile-europarl-refine.yaml
│ ├── pile-freelaw-refine.yaml
│ ├── pile-hackernews-refine.yaml
│ ├── pile-nih-refine.yaml
│ ├── pile-philpaper-refine.yaml
│ ├── pile-pubmed-abstract-refine.yaml
│ ├── pile-pubmed-central-refine.yaml
│ ├── pile-uspto-refine.yaml
│ ├── redpajama-arxiv-refine.yaml
│ ├── redpajama-book-refine.yaml
│ ├── redpajama-c4-refine.yaml
│ ├── redpajama-cc-2019-30-refine.yaml
│ ├── redpajama-cc-2020-05-refine.yaml
│ ├── redpajama-cc-2021-04-refine.yaml
│ ├── redpajama-cc-2022-05-refine.yaml
│ ├── redpajama-cc-2023-06-refine.yaml
│ ├── redpajama-pile-stackexchange-refine.yaml
│ └── redpajama-wiki-refine.yaml
├── datasets
│ ├── local_json.yaml
│ ├── local_parquet.yaml
│ ├── mixture.yaml
│ ├── remote_arxiv.yaml
│ ├── remote_commoncrawl.yaml
│ ├── remote_huggingface.yaml
│ ├── remote_modelscope.yaml
│ ├── remote_wiki.yaml
│ ├── validation_required_fields.yaml
│ └── validation_swift_messages.yaml
├── demo
│ ├── analyzer.yaml
│ ├── bench
│ │ ├── 1_single_op_pipeline.yaml
│ │ ├── 2_multi_op_pipeline.yaml
│ │ ├── 3_duplicate_pipeline.yaml
│ │ ├── model_infer.yaml
│ │ ├── model_train.yaml
│ │ ├── model_train_2_epoch.yaml
│ │ └── vbench_eval.yaml
│ ├── dedup.yaml
│ ├── process-huggingface.yaml
│ ├── process.yaml
│ └── sandbox
│ │ ├── gpt3_data_quality_eval_config.yaml
│ │ ├── gpt3_extra_train_config.json
│ │ ├── gpt3_extra_train_config.yaml
│ │ ├── inception_eval_config.yaml
│ │ └── sandbox.yaml
├── reproduced_bloom
│ ├── README.md
│ ├── README_ZH.md
│ └── bloom-oscar.yaml
└── reproduced_redpajama
│ ├── README.md
│ ├── README_ZH.md
│ ├── redpajama-arxiv.yaml
│ ├── redpajama-books.yaml
│ ├── redpajama-code.yaml
│ └── redpajama-stackexchange.yaml
├── data_juicer
├── __init__.py
├── analysis
│ ├── __init__.py
│ ├── collector.py
│ ├── column_wise_analysis.py
│ ├── diversity_analysis.py
│ ├── draw.py
│ ├── measure.py
│ └── overall_analysis.py
├── config
│ ├── __init__.py
│ └── config.py
├── core
│ ├── __init__.py
│ ├── adapter.py
│ ├── analyzer.py
│ ├── data
│ │ ├── __init__.py
│ │ ├── config_validator.py
│ │ ├── data_validator.py
│ │ ├── dataset_builder.py
│ │ ├── dj_dataset.py
│ │ ├── load_strategy.py
│ │ ├── ray_dataset.py
│ │ └── schema.py
│ ├── executor
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── default_executor.py
│ │ ├── factory.py
│ │ └── ray_executor.py
│ ├── exporter.py
│ ├── monitor.py
│ ├── sandbox
│ │ ├── evaluators.py
│ │ ├── factories.py
│ │ ├── hooks.py
│ │ ├── model_executors.py
│ │ └── pipelines.py
│ └── tracer.py
├── download
│ ├── __init__.py
│ ├── arxiv.py
│ ├── commoncrawl.py
│ ├── downloader.py
│ └── wikipedia.py
├── format
│ ├── __init__.py
│ ├── csv_formatter.py
│ ├── empty_formatter.py
│ ├── formatter.py
│ ├── json_formatter.py
│ ├── load.py
│ ├── parquet_formatter.py
│ ├── text_formatter.py
│ └── tsv_formatter.py
├── ops
│ ├── __init__.py
│ ├── aggregator
│ │ ├── __init__.py
│ │ ├── entity_attribute_aggregator.py
│ │ ├── meta_tags_aggregator.py
│ │ ├── most_relevant_entities_aggregator.py
│ │ └── nested_aggregator.py
│ ├── base_op.py
│ ├── common
│ │ ├── __init__.py
│ │ ├── helper_func.py
│ │ ├── prompt2prompt_pipeline.py
│ │ └── special_characters.py
│ ├── deduplicator
│ │ ├── __init__.py
│ │ ├── document_deduplicator.py
│ │ ├── document_minhash_deduplicator.py
│ │ ├── document_simhash_deduplicator.py
│ │ ├── image_deduplicator.py
│ │ ├── ray_basic_deduplicator.py
│ │ ├── ray_bts_minhash_deduplicator.py
│ │ ├── ray_document_deduplicator.py
│ │ ├── ray_image_deduplicator.py
│ │ ├── ray_video_deduplicator.py
│ │ └── video_deduplicator.py
│ ├── filter
│ │ ├── __init__.py
│ │ ├── alphanumeric_filter.py
│ │ ├── audio_duration_filter.py
│ │ ├── audio_nmf_snr_filter.py
│ │ ├── audio_size_filter.py
│ │ ├── average_line_length_filter.py
│ │ ├── character_repetition_filter.py
│ │ ├── flagged_words_filter.py
│ │ ├── general_field_filter.py
│ │ ├── image_aesthetics_filter.py
│ │ ├── image_aspect_ratio_filter.py
│ │ ├── image_face_count_filter.py
│ │ ├── image_face_ratio_filter.py
│ │ ├── image_nsfw_filter.py
│ │ ├── image_pair_similarity_filter.py
│ │ ├── image_shape_filter.py
│ │ ├── image_size_filter.py
│ │ ├── image_text_matching_filter.py
│ │ ├── image_text_similarity_filter.py
│ │ ├── image_watermark_filter.py
│ │ ├── language_id_score_filter.py
│ │ ├── llm_difficulty_score_filter.py
│ │ ├── llm_quality_score_filter.py
│ │ ├── maximum_line_length_filter.py
│ │ ├── perplexity_filter.py
│ │ ├── phrase_grounding_recall_filter.py
│ │ ├── special_characters_filter.py
│ │ ├── specified_field_filter.py
│ │ ├── specified_numeric_field_filter.py
│ │ ├── stopwords_filter.py
│ │ ├── suffix_filter.py
│ │ ├── text_action_filter.py
│ │ ├── text_entity_dependency_filter.py
│ │ ├── text_length_filter.py
│ │ ├── text_pair_similarity_filter.py
│ │ ├── token_num_filter.py
│ │ ├── video_aesthetics_filter.py
│ │ ├── video_aspect_ratio_filter.py
│ │ ├── video_duration_filter.py
│ │ ├── video_frames_text_similarity_filter.py
│ │ ├── video_motion_score_filter.py
│ │ ├── video_motion_score_raft_filter.py
│ │ ├── video_nsfw_filter.py
│ │ ├── video_ocr_area_ratio_filter.py
│ │ ├── video_resolution_filter.py
│ │ ├── video_tagging_from_frames_filter.py
│ │ ├── video_watermark_filter.py
│ │ ├── word_repetition_filter.py
│ │ └── words_num_filter.py
│ ├── grouper
│ │ ├── __init__.py
│ │ ├── key_value_grouper.py
│ │ ├── naive_grouper.py
│ │ └── naive_reverse_grouper.py
│ ├── load.py
│ ├── mapper
│ │ ├── __init__.py
│ │ ├── annotation
│ │ │ ├── __init__.py
│ │ │ ├── annotation_mapper.py
│ │ │ └── human_preference_annotation_mapper.py
│ │ ├── audio_add_gaussian_noise_mapper.py
│ │ ├── audio_ffmpeg_wrapped_mapper.py
│ │ ├── calibrate_qa_mapper.py
│ │ ├── calibrate_query_mapper.py
│ │ ├── calibrate_response_mapper.py
│ │ ├── chinese_convert_mapper.py
│ │ ├── clean_copyright_mapper.py
│ │ ├── clean_email_mapper.py
│ │ ├── clean_html_mapper.py
│ │ ├── clean_ip_mapper.py
│ │ ├── clean_links_mapper.py
│ │ ├── dialog_intent_detection_mapper.py
│ │ ├── dialog_sentiment_detection_mapper.py
│ │ ├── dialog_sentiment_intensity_mapper.py
│ │ ├── dialog_topic_detection_mapper.py
│ │ ├── expand_macro_mapper.py
│ │ ├── extract_entity_attribute_mapper.py
│ │ ├── extract_entity_relation_mapper.py
│ │ ├── extract_event_mapper.py
│ │ ├── extract_keyword_mapper.py
│ │ ├── extract_nickname_mapper.py
│ │ ├── extract_support_text_mapper.py
│ │ ├── extract_tables_from_html_mapper.py
│ │ ├── fix_unicode_mapper.py
│ │ ├── generate_qa_from_examples_mapper.py
│ │ ├── generate_qa_from_text_mapper.py
│ │ ├── image_blur_mapper.py
│ │ ├── image_captioning_from_gpt4v_mapper.py
│ │ ├── image_captioning_mapper.py
│ │ ├── image_diffusion_mapper.py
│ │ ├── image_face_blur_mapper.py
│ │ ├── image_remove_background_mapper.py
│ │ ├── image_segment_mapper.py
│ │ ├── image_tagging_mapper.py
│ │ ├── imgdiff_difference_area_generator_mapper.py
│ │ ├── imgdiff_difference_caption_generator_mapper.py
│ │ ├── mllm_mapper.py
│ │ ├── nlpaug_en_mapper.py
│ │ ├── nlpcda_zh_mapper.py
│ │ ├── optimize_qa_mapper.py
│ │ ├── optimize_query_mapper.py
│ │ ├── optimize_response_mapper.py
│ │ ├── pair_preference_mapper.py
│ │ ├── punctuation_normalization_mapper.py
│ │ ├── python_file_mapper.py
│ │ ├── python_lambda_mapper.py
│ │ ├── query_intent_detection_mapper.py
│ │ ├── query_sentiment_detection_mapper.py
│ │ ├── query_topic_detection_mapper.py
│ │ ├── relation_identity_mapper.py
│ │ ├── remove_bibliography_mapper.py
│ │ ├── remove_comments_mapper.py
│ │ ├── remove_header_mapper.py
│ │ ├── remove_long_words_mapper.py
│ │ ├── remove_non_chinese_character_mapper.py
│ │ ├── remove_repeat_sentences_mapper.py
│ │ ├── remove_specific_chars_mapper.py
│ │ ├── remove_table_text_mapper.py
│ │ ├── remove_words_with_incorrect_substrings_mapper.py
│ │ ├── replace_content_mapper.py
│ │ ├── sdxl_prompt2prompt_mapper.py
│ │ ├── sentence_augmentation_mapper.py
│ │ ├── sentence_split_mapper.py
│ │ ├── text_chunk_mapper.py
│ │ ├── video_captioning_from_audio_mapper.py
│ │ ├── video_captioning_from_frames_mapper.py
│ │ ├── video_captioning_from_summarizer_mapper.py
│ │ ├── video_captioning_from_video_mapper.py
│ │ ├── video_extract_frames_mapper.py
│ │ ├── video_face_blur_mapper.py
│ │ ├── video_ffmpeg_wrapped_mapper.py
│ │ ├── video_remove_watermark_mapper.py
│ │ ├── video_resize_aspect_ratio_mapper.py
│ │ ├── video_resize_resolution_mapper.py
│ │ ├── video_split_by_duration_mapper.py
│ │ ├── video_split_by_key_frame_mapper.py
│ │ ├── video_split_by_scene_mapper.py
│ │ ├── video_tagging_from_audio_mapper.py
│ │ ├── video_tagging_from_frames_mapper.py
│ │ └── whitespace_normalization_mapper.py
│ ├── mixins.py
│ ├── op_fusion.py
│ └── selector
│ │ ├── __init__.py
│ │ ├── frequency_specified_field_selector.py
│ │ ├── random_selector.py
│ │ ├── range_specified_field_selector.py
│ │ ├── tags_specified_field_selector.py
│ │ └── topk_specified_field_selector.py
├── tools
│ └── __init__.py
└── utils
│ ├── __init__.py
│ ├── asset_utils.py
│ ├── availability_utils.py
│ ├── cache_utils.py
│ ├── ckpt_utils.py
│ ├── common_utils.py
│ ├── compress.py
│ ├── constant.py
│ ├── file_utils.py
│ ├── fingerprint_utils.py
│ ├── lazy_loader.py
│ ├── logger_utils.py
│ ├── mm_utils.py
│ ├── model_utils.py
│ ├── nltk_utils.py
│ ├── process_utils.py
│ ├── registry.py
│ ├── resource_utils.py
│ ├── sample.py
│ └── unittest_utils.py
├── demos
├── README.md
├── README_ZH.md
├── api_service
│ ├── configs
│ │ ├── dj_config_template.yaml
│ │ └── model_configs.json
│ ├── react_data_filter_process.ipynb
│ ├── react_data_mapper_process.ipynb
│ ├── utils.py
│ ├── wrapped_filters.py
│ └── wrapped_mappers.py
├── auto_evaluation_helm
│ ├── README_ZH.md
│ ├── app.py
│ └── imgs
│ │ ├── data-juicer.png
│ │ ├── eval-01.png
│ │ └── eval-02.png
├── data
│ ├── demo-dataset-annotation-human-preference.jsonl
│ ├── demo-dataset-chatml.jsonl
│ ├── demo-dataset-content.jsonl
│ ├── demo-dataset-deduplication.jsonl
│ ├── demo-dataset-images.jsonl
│ ├── demo-dataset-videos.jsonl
│ ├── demo-dataset.jsonl
│ ├── demo-dataset_1725870268.jsonl
│ └── demo-dataset_1725870628.jsonl
├── data_mixture
│ ├── app.py
│ └── data
│ │ ├── redpajama-c4-refined.jsonl
│ │ ├── redpajama-cc-2023-06-refined.jsonl
│ │ ├── redpajama-pile-stackexchange-refined.jsonl
│ │ ├── the-pile-nih-refined.jsonl
│ │ └── the-pile-uspto-refined.jsonl
├── data_process_hpo
│ ├── app.py
│ └── imgs
│ │ └── data-juicer.png
├── data_process_loop
│ ├── app.py
│ ├── configs
│ │ └── demo.yaml
│ └── data
│ │ └── demo-dataset.jsonl
├── data_visualization_diversity
│ ├── app.py
│ ├── configs
│ │ └── demo.yaml
│ └── data
│ │ └── demo-dataset.jsonl
├── data_visualization_op_effect
│ ├── app.py
│ ├── configs
│ │ ├── demo_en.yaml
│ │ └── demo_zh.yaml
│ └── data
│ │ └── demo-dataset.jsonl
├── data_visualization_op_insight
│ ├── app.css
│ ├── app.py
│ └── cache
│ │ └── .gitkeep
├── data_visualization_statistics
│ ├── app.py
│ ├── configs
│ │ └── demo.yaml
│ └── data
│ │ └── demo-dataset.jsonl
├── overview_scan
│ ├── app.py
│ └── data
│ │ └── demo-dataset.jsonl
├── process_cft_zh_data
│ ├── app.py
│ └── data
│ │ └── alpaca-cot.jsonl
├── process_code_data
│ ├── app.py
│ └── data
│ │ └── stack_exchange.jsonl
├── process_on_ray
│ ├── configs
│ │ ├── dedup.yaml
│ │ ├── demo-new-config.yaml
│ │ └── demo.yaml
│ └── data
│ │ ├── demo-dataset.json
│ │ └── demo-dataset.jsonl
├── process_sci_data
│ ├── app.py
│ └── data
│ │ └── arxiv.jsonl
├── process_video_on_ray
│ ├── configs
│ │ ├── demo-new-config.yaml
│ │ └── demo.yaml
│ └── data
│ │ ├── Note.md
│ │ ├── demo-dataset.jsonl
│ │ └── videos
│ │ ├── video1.mp4
│ │ ├── video2.mp4
│ │ └── video3.mp4
├── role_playing_system_prompt
│ ├── README_ZH.md
│ ├── role_playing_system_prompt.yaml
│ └── system_prompt_generator.py
├── tool_dataset_splitting_by_language
│ ├── app.py
│ ├── data
│ │ └── demo-dataset.jsonl
│ └── dataset_splitting_by_language.py
└── tool_quality_classifier
│ ├── app.py
│ ├── data
│ └── demo-dataset.jsonl
│ └── quality_classifier
│ ├── __init__.py
│ ├── eval.py
│ ├── predict.py
│ ├── qc_utils.py
│ └── train.py
├── docs
├── BadDataExhibition.md
├── BadDataExhibition_ZH.md
├── DJ_SORA.md
├── DJ_SORA_ZH.md
├── DJ_service.md
├── DJ_service_ZH.md
├── DatasetCfg.md
├── DatasetCfg_ZH.md
├── DeveloperGuide.md
├── DeveloperGuide_ZH.md
├── Distributed.md
├── Distributed_ZH.md
├── Operators.md
├── RecipeGallery.md
├── RecipeGallery_ZH.md
├── Sandbox.md
├── Sandbox_ZH.md
├── awesome_llm_data.md
├── imgs
│ ├── data-juicer.jpg
│ ├── eval-01.png
│ └── eval-02.png
├── sphinx_doc
│ ├── Makefile
│ ├── README.md
│ ├── README_ZH.md
│ ├── _templates
│ │ └── package.rst_t
│ ├── build_doc.sh
│ ├── make.bat
│ ├── redirect.html
│ └── source
│ │ ├── _static
│ │ ├── awesome-list.html
│ │ ├── sidebar-menu.css
│ │ └── tutorial_kdd24.html
│ │ ├── _templates
│ │ ├── page.html
│ │ └── sidebar
│ │ │ └── bottom_menu.html
│ │ ├── api.rst
│ │ ├── conf.py
│ │ ├── index.rst
│ │ └── index_ZH.rst
└── tutorial
│ ├── DJ-Cookbook.md
│ ├── DJ-Cookbook_ZH.md
│ ├── Installation.md
│ ├── Installation_ZH.md
│ ├── QuickStart.md
│ └── QuickStart_ZH.md
├── label_studio_localhost_connection.json
├── pyproject.toml
├── scripts
├── README.md
├── dlc
│ ├── partition_data_dlc.py
│ └── run_on_dlc.sh
└── run_slurm.sh
├── service.py
├── tests
├── __init__.py
├── analysis
│ ├── __init__.py
│ ├── test_collector.py
│ ├── test_column_wise_analysis.py
│ ├── test_diversity_analysis.py
│ ├── test_draw.py
│ ├── test_measure.py
│ └── test_overall_analysis.py
├── benchmark_performance
│ ├── configs
│ │ ├── audio.yaml
│ │ ├── image.yaml
│ │ ├── text.yaml
│ │ └── video.yaml
│ ├── report.py
│ └── run.sh
├── config
│ ├── __init__.py
│ ├── demo_4_test.yaml
│ ├── demo_4_test_bad_val.yaml
│ └── test_config.py
├── core
│ ├── __init__.py
│ ├── data
│ │ ├── __init__.py
│ │ ├── test_data
│ │ │ ├── sample.jsonl
│ │ │ ├── sample.txt
│ │ │ ├── test_config.yaml
│ │ │ ├── test_config_list.yaml
│ │ │ └── test_config_ray.yaml
│ │ ├── test_data_validator.py
│ │ ├── test_dataset_builder.py
│ │ ├── test_dj_dataset.py
│ │ ├── test_load_strategy.py
│ │ └── test_ray_dataset.py
│ ├── test_adapter.py
│ └── test_monitor.py
├── download
│ ├── __init__.py
│ └── test_download.py
├── format
│ ├── __init__.py
│ ├── data
│ │ ├── structured
│ │ │ ├── demo-dataset.csv
│ │ │ ├── demo-dataset.jsonl
│ │ │ ├── demo-dataset.parquet
│ │ │ └── demo-dataset.tsv
│ │ └── text
│ │ │ ├── sample1.txt
│ │ │ ├── sample2.txt
│ │ │ ├── sample3.txt
│ │ │ ├── sample4.txt
│ │ │ ├── sample5.txt
│ │ │ └── sample6.txt
│ ├── test_csv_formatter.py
│ ├── test_empty_formatter.py
│ ├── test_json_formatter.py
│ ├── test_load_formatter.py
│ ├── test_parquet_formatter.py
│ ├── test_tsv_formatter.py
│ └── test_unify_format.py
├── ops
│ ├── __init__.py
│ ├── aggregator
│ │ ├── __init__.py
│ │ ├── test_entity_attribute_aggregator.py
│ │ ├── test_meta_tags_aggregator.py
│ │ ├── test_most_relevant_entities_aggregator.py
│ │ └── test_nested_aggregator.py
│ ├── common
│ │ └── __init__.py
│ ├── data
│ │ ├── audio1.wav
│ │ ├── audio2.wav
│ │ ├── audio3.ogg
│ │ ├── blip.jpg
│ │ ├── cat.jpg
│ │ ├── img1.png
│ │ ├── img2.jpg
│ │ ├── img3.jpg
│ │ ├── img4.png
│ │ ├── img5.jpg
│ │ ├── img6.jpg
│ │ ├── img7.jpg
│ │ ├── img8.jpg
│ │ ├── img_pair_1.jpg
│ │ ├── img_pair_2.jpg
│ │ ├── lena-face.jpg
│ │ ├── lena.jpg
│ │ ├── video1.mp4
│ │ ├── video2.mp4
│ │ ├── video3-no-audio.mp4
│ │ ├── video3.mp4
│ │ ├── video4.mp4
│ │ └── video5.mp4
│ ├── deduplicator
│ │ ├── __init__.py
│ │ ├── test_document_deduplicator.py
│ │ ├── test_document_minhash_deduplicator.py
│ │ ├── test_document_simhash_deduplicator.py
│ │ ├── test_image_deduplicator.py
│ │ ├── test_ray_bts_minhash_deduplicator.py
│ │ ├── test_ray_document_deduplicator.py
│ │ ├── test_ray_image_deduplicator.py
│ │ ├── test_ray_video_deduplicator.py
│ │ └── test_video_deduplicator.py
│ ├── filter
│ │ ├── __init__.py
│ │ ├── test_alphanumeric_filter.py
│ │ ├── test_audio_duration_filter.py
│ │ ├── test_audio_nmf_snr_filter.py
│ │ ├── test_audio_size_filter.py
│ │ ├── test_average_line_length_filter.py
│ │ ├── test_character_repetition_filter.py
│ │ ├── test_flagged_words_filter.py
│ │ ├── test_general_field_filter.py
│ │ ├── test_image_aesthetics_filter.py
│ │ ├── test_image_aspect_ratio_filter.py
│ │ ├── test_image_face_count_filter.py
│ │ ├── test_image_face_ratio_filter.py
│ │ ├── test_image_nsfw_filter.py
│ │ ├── test_image_pair_similarity_filter.py
│ │ ├── test_image_shape_filter.py
│ │ ├── test_image_size_filter.py
│ │ ├── test_image_text_matching_filter.py
│ │ ├── test_image_text_similarity_filter.py
│ │ ├── test_image_watermark_filter.py
│ │ ├── test_language_id_score_filter.py
│ │ ├── test_llm_difficulty_score_filter.py
│ │ ├── test_llm_quality_score_filter.py
│ │ ├── test_maximum_line_length_filter.py
│ │ ├── test_perplexity_filter.py
│ │ ├── test_phrase_grounding_recall_filter.py
│ │ ├── test_special_characters_filter.py
│ │ ├── test_specified_field_filter.py
│ │ ├── test_specified_numeric_field_filter.py
│ │ ├── test_stopwords_filter.py
│ │ ├── test_suffix_filter.py
│ │ ├── test_text_action_filter.py
│ │ ├── test_text_entity_dependency_filter.py
│ │ ├── test_text_length_filter.py
│ │ ├── test_text_pair_similarity_filter.py
│ │ ├── test_token_num_filter.py
│ │ ├── test_video_aesthetics_filter.py
│ │ ├── test_video_aspect_ratio_filter.py
│ │ ├── test_video_duration_filter.py
│ │ ├── test_video_frames_text_similarity_filter.py
│ │ ├── test_video_motion_score_filter.py
│ │ ├── test_video_motion_score_raft_filter.py
│ │ ├── test_video_nsfw_filter.py
│ │ ├── test_video_ocr_area_ratio_filter.py
│ │ ├── test_video_resolution_filter.py
│ │ ├── test_video_tagging_from_frames_filter.py
│ │ ├── test_video_watermark_filter.py
│ │ ├── test_word_repetition_filter.py
│ │ └── test_words_num_filter.py
│ ├── grouper
│ │ ├── __init__.py
│ │ ├── test_key_value_grouper.py
│ │ ├── test_naive_grouper.py
│ │ └── test_naive_reverse_grouper.py
│ ├── mapper
│ │ ├── __init__.py
│ │ ├── annotation
│ │ │ ├── __init__.py
│ │ │ ├── test_annotation_mapper.py
│ │ │ └── test_human_preference_annotation_mapper.py
│ │ ├── test_audio_add_gaussian_noise_mapper.py
│ │ ├── test_audio_ffmpeg_wrapped_mapper.py
│ │ ├── test_calibrate_qa_mapper.py
│ │ ├── test_calibrate_query_mapper.py
│ │ ├── test_calibrate_response_mapper.py
│ │ ├── test_chinese_convert_mapper.py
│ │ ├── test_clean_copyright_mapper.py
│ │ ├── test_clean_email_mapper.py
│ │ ├── test_clean_html_mapper.py
│ │ ├── test_clean_ip_mapper.py
│ │ ├── test_clean_links_mapper.py
│ │ ├── test_dialog_intent_detection_mapper.py
│ │ ├── test_dialog_sentiment_detection_mapper.py
│ │ ├── test_dialog_sentiment_intensity_mapper.py
│ │ ├── test_dialog_topic_detection_mapper.py
│ │ ├── test_expand_macro_mapper.py
│ │ ├── test_extract_entity_attribute_mapper.py
│ │ ├── test_extract_entity_relation_mapper.py
│ │ ├── test_extract_event_mapper.py
│ │ ├── test_extract_keyword_mapper.py
│ │ ├── test_extract_nickname_mapper.py
│ │ ├── test_extract_support_text_mapper.py
│ │ ├── test_extract_tables_from_html_mapper.py
│ │ ├── test_fix_unicode_mapper.py
│ │ ├── test_generate_qa_from_examples_mapper.py
│ │ ├── test_generate_qa_from_text_mapper.py
│ │ ├── test_image_blur_mapper.py
│ │ ├── test_image_captioning_mapper.py
│ │ ├── test_image_diffusion_mapper.py
│ │ ├── test_image_face_blur_mapper.py
│ │ ├── test_image_remove_background_mapper.py
│ │ ├── test_image_segment_mapper.py
│ │ ├── test_image_tagging_mapper.py
│ │ ├── test_imgdiff_difference_area_generator_mapper.py
│ │ ├── test_imgdiff_difference_caption_generator_mapper.py
│ │ ├── test_mllm_mapper.py
│ │ ├── test_nlpaug_en_mapper.py
│ │ ├── test_nlpcda_zh_mapper.py
│ │ ├── test_optimize_qa_mapper.py
│ │ ├── test_optimize_query_mapper.py
│ │ ├── test_optimize_response_mapper.py
│ │ ├── test_pair_preference_mapper.py
│ │ ├── test_punctuation_normalization_mapper.py
│ │ ├── test_python_file_mapper.py
│ │ ├── test_python_lambda_mapper.py
│ │ ├── test_query_intent_detection_mapper.py
│ │ ├── test_query_sentiment_detection_mapper.py
│ │ ├── test_query_topic_detection_mapper.py
│ │ ├── test_relation_identity_mapper.py
│ │ ├── test_remove_bibliography_mapper.py
│ │ ├── test_remove_comments_mapper.py
│ │ ├── test_remove_header_mapper.py
│ │ ├── test_remove_long_words_mapper.py
│ │ ├── test_remove_non_chinese_character_mapper.py
│ │ ├── test_remove_repeat_sentences_mapper.py
│ │ ├── test_remove_specific_chars_mapper.py
│ │ ├── test_remove_table_text_mapper.py
│ │ ├── test_remove_words_with_incorrect_substrings_mapper.py
│ │ ├── test_replace_content_mapper.py
│ │ ├── test_sdxl_prompt2prompt_mapper.py
│ │ ├── test_sentence_augmentation_mapper.py
│ │ ├── test_sentence_split_mapper.py
│ │ ├── test_text_chunk_mapper.py
│ │ ├── test_video_captioning_from_audio_mapper.py
│ │ ├── test_video_captioning_from_frames_mapper.py
│ │ ├── test_video_captioning_from_summarizer_mapper.py
│ │ ├── test_video_captioning_from_video_mapper.py
│ │ ├── test_video_extract_frames_mapper.py
│ │ ├── test_video_face_blur_mapper.py
│ │ ├── test_video_ffmpeg_wrapped_mapper.py
│ │ ├── test_video_remove_watermark_mapper.py
│ │ ├── test_video_resize_aspect_ratio_mapper.py
│ │ ├── test_video_resize_resolution_mapper.py
│ │ ├── test_video_split_by_duration_mapper.py
│ │ ├── test_video_split_by_key_frame_mapper.py
│ │ ├── test_video_split_by_scene_mapper.py
│ │ ├── test_video_tagging_from_audio_mapper.py
│ │ ├── test_video_tagging_from_frames_mapper.py
│ │ └── test_whitespace_normalization_mapper.py
│ ├── selector
│ │ ├── __init__.py
│ │ ├── test_frequency_specified_field_selector.py
│ │ ├── test_random_selector.py
│ │ ├── test_range_specified_field_selector.py
│ │ ├── test_tags_specified_field_selector.py
│ │ └── test_topk_specified_field_selector.py
│ └── test_op_fusion.py
├── run.py
├── tools
│ ├── __init__.py
│ └── test_process_data.py
└── utils
│ ├── __init__.py
│ ├── test_asset_utils.py
│ ├── test_availablility_utils.py
│ ├── test_cache_utils.py
│ ├── test_ckpt_utils.py
│ ├── test_common_utils.py
│ ├── test_compress.py
│ ├── test_constant.py
│ ├── test_file_utils.py
│ ├── test_fingerprint_utils.py
│ ├── test_lazy_loader.py
│ ├── test_logger_utils.py
│ ├── test_mm_utils.py
│ ├── test_model_utils.py
│ ├── test_process_utils.py
│ ├── test_registry.py
│ ├── test_resource_utils.py
│ └── test_unittest_utils.py
├── thirdparty
├── LLM_ecosystems
│ ├── README.md
│ ├── README_ZH.md
│ ├── patch
│ │ ├── helm.diff
│ │ └── megatron.diff
│ ├── setup_helm.sh
│ └── setup_megatron.sh
└── models
│ ├── README.md
│ ├── README_ZH.md
│ ├── patch
│ └── easyanimate.diff
│ └── setup_easyanimate.sh
├── tools
├── __init__.py
├── analyze_data.py
├── converter
│ ├── batch_convert.sh
│ ├── convert_gpt_to_transformers.py
│ └── modeling_megatron_llama.py
├── data_resplit.py
├── distributed_deduplication
│ ├── README.md
│ ├── README_ZH.md
│ ├── __init__.py
│ ├── dedup_utils.py
│ └── spark_dedup.py
├── dj_install.py
├── evaluator
│ ├── README.md
│ ├── README_ZH.md
│ ├── config
│ │ ├── evaluator_example.yaml
│ │ └── helm_spec_template.conf
│ ├── evaluator.py
│ ├── gpt_eval
│ │ ├── README.md
│ │ ├── README_ZH.md
│ │ ├── __init__.py
│ │ ├── answer
│ │ │ └── openai
│ │ │ │ └── gpt-3.5-turbo.jsonl
│ │ ├── answer_generator.py
│ │ ├── config
│ │ │ ├── config.yaml
│ │ │ ├── prompt.jsonl
│ │ │ ├── question.jsonl
│ │ │ └── reviewer.jsonl
│ │ └── gpt_evaluator.py
│ └── recorder
│ │ ├── README.md
│ │ ├── README_ZH.md
│ │ ├── __init__.py
│ │ ├── config
│ │ ├── leaderboard_example.yaml
│ │ ├── llama_example.yaml
│ │ └── mymodel_example.yaml
│ │ └── wandb_writer.py
├── fmt_conversion
│ ├── README.md
│ ├── README_ZH.md
│ ├── multimodal
│ │ ├── README.md
│ │ ├── README_ZH.md
│ │ ├── absolute_path_to_relative_path.py
│ │ ├── data_juicer_format_to_target_format
│ │ │ ├── dj_to_internvid.py
│ │ │ ├── dj_to_llava.py
│ │ │ ├── dj_to_mmc4.py
│ │ │ ├── dj_to_msrvtt.py
│ │ │ ├── dj_to_video_chatgpt.py
│ │ │ ├── dj_to_wavcaps.py
│ │ │ └── dj_to_youku.py
│ │ ├── source_format_to_data_juicer_format
│ │ │ ├── internvid_to_dj.py
│ │ │ ├── llava_to_dj.py
│ │ │ ├── mmc4_to_dj.py
│ │ │ ├── msrvtt_to_dj.py
│ │ │ ├── video_chatgpt_to_dj.py
│ │ │ ├── wavcaps_to_dj.py
│ │ │ └── youku_to_dj.py
│ │ └── utils.py
│ └── post_tuning_dialog
│ │ ├── README.md
│ │ ├── README_ZH.md
│ │ ├── data_juicer_format_to_target_format
│ │ ├── dj_to_alpaca.py
│ │ ├── dj_to_llama_factory_sharegpt.py
│ │ ├── dj_to_messages.py
│ │ └── dj_to_ms_swift_sharegpt.py
│ │ └── source_format_to_data_juicer_format
│ │ ├── alpaca_to_dj.py
│ │ ├── llama_factory_sharegpt_to_dj.py
│ │ ├── messages_to_dj.py
│ │ └── ms_swift_sharegpt_to_dj.py
├── generate_smtp_cert.py
├── generate_uv_lock.py
├── hpo
│ ├── README.md
│ ├── README_ZH.md
│ ├── configs
│ │ ├── process.yaml
│ │ └── quality_score_hpo.yaml
│ ├── demo-redpajama-c4-refined.jsonl
│ ├── execute_hpo_3sigma.py
│ ├── execute_hpo_wandb.py
│ └── objects.py
├── humanops
│ ├── README.md
│ ├── enable_legacy_token.png
│ └── label_studio_service.py
├── mm_eval
│ ├── __init__.py
│ ├── inception_metrics
│ │ ├── README.md
│ │ ├── README_ZH.md
│ │ ├── calc_metrics_for_videos.py
│ │ ├── dataset.py
│ │ ├── distributed.py
│ │ ├── util.py
│ │ └── video_metrics
│ │ │ ├── frechet_inception_distance.py
│ │ │ ├── frechet_video_distance.py
│ │ │ ├── inception_score.py
│ │ │ ├── kernel_inception_distance.py
│ │ │ ├── kernel_video_distance.py
│ │ │ ├── metric_main.py
│ │ │ ├── metric_utils.py
│ │ │ ├── precision_recall.py
│ │ │ ├── video_inception_score.py
│ │ │ └── video_precision_recall.py
│ └── vbench_metrics
│ │ ├── README.md
│ │ ├── README_ZH.md
│ │ ├── VBench_full_info.json
│ │ ├── VBench_mini_info.json
│ │ └── evaluate.py
├── multimodal
│ └── __init__.py
├── postprocess
│ ├── README.md
│ ├── README_ZH.md
│ ├── count_token.py
│ ├── data_mixture.py
│ └── deserialize_meta.py
├── preprocess
│ ├── README.md
│ ├── README_ZH.md
│ ├── dataset_split_by_language.py
│ ├── raw_alpaca_cot_merge_add_meta.py
│ ├── raw_arxiv_to_jsonl.py
│ ├── raw_stackexchange_to_jsonl.py
│ ├── reformat_csv_nan_value.py
│ ├── reformat_jsonl_nan_value.py
│ └── serialize_meta.py
├── process_data.py
├── quality_classifier
│ ├── README.md
│ ├── README_ZH.md
│ ├── __init__.py
│ ├── eval.py
│ ├── predict.py
│ ├── qc_utils.py
│ └── train.py
└── sandbox_starter.py
└── uv.lock
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | # avoid measuring strange non-existing files
4 | /workspace/config.py
5 | /workspace/config-3.py
6 |
7 | # avoid measuring third-party dist packages
8 | */dist-packages/*
9 |
10 | # avoid measuring unit test code
11 | tests/*
12 |
13 | [report]
14 | ignore_errors = True
15 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Custom issue template
3 | about: Describe this issue template's purpose here.
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | name: pre-commit
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | pre-commit:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | - uses: actions/setup-python@v4
11 | with:
12 | python-version: '3.10'
13 | - uses: pre-commit/action@v3.0.0
14 |
--------------------------------------------------------------------------------
/.github/workflows/publish-docker-oss.yml:
--------------------------------------------------------------------------------
1 | name: Sync Docker Image to Aliyun OSS
2 |
3 | # This workflow uses actions that are not certified by GitHub.
4 | # They are provided by a third-party and are governed by
5 | # separate terms of service, privacy policy, and support
6 | # documentation.
7 |
8 | on:
9 | workflow_dispatch:
10 | schedule:
11 | - cron: '0 16 * * 1' # Every Monday at 16:00 UTC -- Every Tuesday at 0:00 in Beijing Time
12 |
13 | env:
14 | IMAGE_NAME: datajuicer/data-juicer
15 | IMAGE_TAG: latest
16 | REGISTRY_MIRROR: docker.xiaogenban1993.com
17 | ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
18 |
19 |
20 | jobs:
21 | build:
22 | runs-on: [docker-internal]
23 | timeout-minutes: 1440 # 24 hours
24 |
25 | steps:
26 | - name: Pull Docker image
27 | run: |
28 | docker pull ${{ env.REGISTRY_MIRROR }}/${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}
29 |
30 | - name: Export to tar file
31 | run: |
32 | docker save ${{ env.REGISTRY_MIRROR }}/${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} | gzip > data-juicer-${{ env.IMAGE_TAG }}.tar.gz
33 |
34 | - name: Sync Docker image to Aliyun OSS
35 | id: sync_to_oss
36 | run: |
37 | curl -s "http://127.0.0.1:5006/sync2oss?file_path=data-juicer-${{ env.IMAGE_TAG }}.tar.gz" > /dev/null
38 |
39 | - name: Clean up resources
40 | if: success()
41 | run: |
42 | rm -rf data-juicer-${{ env.IMAGE_TAG }}.tar.gz
43 |
--------------------------------------------------------------------------------
/.github/workflows/publish-pypi.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Publish PyPi Package
10 |
11 | on:
12 | workflow_dispatch:
13 | release:
14 | types: [published]
15 |
16 | permissions:
17 | contents: read
18 |
19 | jobs:
20 | deploy:
21 |
22 | runs-on: ubuntu-latest
23 |
24 | steps:
25 | - uses: actions/checkout@v4
26 | - name: Set up Python
27 | uses: actions/setup-python@v5
28 | with:
29 | python-version: '3.x'
30 | - name: Install dependencies
31 | run: |
32 | python -m pip install --upgrade pip
33 | pip install setuptools
34 | - name: Build package
35 | run: python setup.py sdist bdist_wheel
36 | - name: Publish package
37 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
38 | with:
39 | user: __token__
40 | password: ${{ secrets.PYPI_API_TOKEN }}
41 |
--------------------------------------------------------------------------------
/.github/workflows/sphinx_docs_linkcheck.yml:
--------------------------------------------------------------------------------
1 | name: Sphinx Docs LinkCheck
2 |
3 | on:
4 | schedule:
5 | - cron: "0 5 * * 0" # Runs at 05:00 on Sunday.
6 | workflow_dispatch:
7 |
8 | concurrency:
9 | group: ${{ github.workflow }}-${{ github.ref }}
10 | cancel-in-progress: true
11 |
12 | jobs:
13 | linkcheck:
14 | name: Check Links
15 | runs-on: ubuntu-latest
16 | strategy:
17 | matrix:
18 | python-version: [ "3.10" ]
19 | steps:
20 | - name: Checkout
21 | uses: actions/checkout@v4
22 | - name: Setup Python ${{ matrix.python-version }}
23 | uses: actions/setup-python@master
24 | with:
25 | python-version: ${{ matrix.python-version }}
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install -v -e .[dev]
30 | - name: Run Sphinx linkcheck
31 | run: |
32 | cd docs/sphinx_doc
33 | sphinx-apidoc -f -o source ../../data_juicer -t _templates
34 | python ./create_symlinks.py
35 | sphinx-build -b linkcheck -j 4 source build/linkcheck || true
36 | - name: Upload Linkcheck Results
37 | uses: actions/upload-artifact@v4
38 | with:
39 | name: LinkcheckResults
40 | path: "docs/sphinx_doc/build/linkcheck"
41 |
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
2 | #
3 | # You can adjust the behavior by modifying this file.
4 | # For more information, see:
5 | # https://github.com/actions/stale
6 | name: Mark stale issues and pull requests
7 |
8 | on:
9 | schedule:
10 | - cron: '30 9 * * *'
11 |
12 | jobs:
13 | stale:
14 |
15 | runs-on: ubuntu-latest
16 | permissions:
17 | issues: write
18 | pull-requests: write
19 |
20 | steps:
21 | - uses: actions/stale@v5
22 | with:
23 | repo-token: ${{ secrets.GITHUB_TOKEN }}
24 | stale-issue-message: 'This issue is marked as stale because there has been no activity for 21 days. Remove the stale label or add new comments, or this issue will be closed in 3 days.'
25 | stale-pr-message: 'This PR is marked as stale because there has been no activity for 21 days. Remove the stale label or add new comments, or this PR will be closed in 3 days.'
26 | close-issue-message: 'Close this stale issue.'
27 | close-pr-message: 'Close this stale PR.'
28 | stale-issue-label: 'stale-issue'
29 | stale-pr-label: 'stale-pr'
30 | days-before-stale: 21
31 | days-before-close: 3
32 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # data & resources
3 | outputs/
4 | assets/
5 |
6 | # setup
7 | data_juicer.egg-info/
8 | py_data_juicer.egg-info/
9 | build/
10 | dist
11 |
12 | # others
13 | .DS_Store
14 | .idea/
15 | wandb/
16 | __pycache__
17 | .vscode/
18 |
19 | # label studio related
20 | label_studio_data/
21 | label_studio_venv/
22 | label_studio_connection.json
23 |
24 | **/__dj__produced_data__/*
25 | venv/
26 | .venv/
27 |
28 | # dup files created by tests
29 | tests/ops/data/*dup*
30 | tests/tools/tmp_*/
31 | tests/ops/deduplicator/chinese_dedup/
32 | tests/ops/deduplicator/english_dedup/
33 |
--------------------------------------------------------------------------------
/configs/annotation/annotation_default.yaml:
--------------------------------------------------------------------------------
1 | # configs/annotation/annotation_default.yaml
2 | # Process config for human preference annotation with Label Studio
3 |
4 | # Global parameters
5 | project_name: 'demo-annotation-human-preference'
6 | dataset_path: './demos/data/demo-dataset-annotation-human-preference.jsonl'
7 | np: 1 # Use single process for annotation tasks
8 | open_tracer: true
9 |
10 | export_path: './outputs/demo-annotation-human-preference/annotated-data.jsonl'
11 |
12 | # use prompt as the text field, which will be used in the label studio template
13 | text_keys: "prompt"
14 |
15 | # Process schedule
16 | process:
17 | # Annotation step using Label Studio with human preference
18 | - human_preference_annotation_mapper:
19 | # general annotation project settings
20 | project_name_prefix: "Human_Preference_Annotation"
21 | wait_for_annotations: true # Whether to wait for annotations to complete
22 | timeout: 3600 # Maximum time to wait for annotations in seconds (1 hour)
23 | poll_interval: 10 # Time between annotation status checks in seconds
24 | max_tasks_per_batch: 10 # Maximum number of tasks in a single batch
25 | notification_config:
26 | enabled: false
27 |
28 | # label studio connection settings
29 | api_url: "http://localhost:7070" # Default Label Studio URL
30 | api_key: "05409236-67a5-4169-af96-a52a818d0e81" # Your API key for label studio authentication # pragma: allowlist secret
31 |
32 | # human preference annotation settings
33 | prompt_key: "prompt" # Prompt field
34 | answer1_key: "answer1" # First answer option
35 | answer2_key: "answer2" # Second answer option
36 | chosen_key: "chosen" # Chosen field
37 | rejected_key: "rejected" # Rejected field
38 |
--------------------------------------------------------------------------------
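
The human preference keys configured above (prompt_key, answer1_key, answer2_key, chosen_key, rejected_key) imply input samples shaped roughly like the sketch below. The field values are illustrative and are not taken from demos/data/demo-dataset-annotation-human-preference.jsonl; that demo file is JSONL, so this is only the same structure written out in YAML form.

# sketch of one input sample for human_preference_annotation_mapper (illustrative values)
prompt: "Explain what a filter operator does in Data-Juicer."          # read via prompt_key / text_keys
answer1: "A filter drops samples that fail a configured condition."    # candidate shown via answer1_key
answer2: "A filter rewrites the text of every sample."                 # candidate shown via answer2_key
# after annotation completes, the mapper is expected to fill the fields named by
# chosen_key ("chosen") and rejected_key ("rejected") with the preferred and non-preferred answers
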
/configs/config_min.yaml:
--------------------------------------------------------------------------------
1 | project_name: 'min'
2 |
3 | np: 4 # number of subprocesses to process your dataset
4 | export_path: './outputs/'
5 |
6 | text_keys: 'text'
7 | video_key: 'videos'
8 | image_key: 'images'
9 | audio_key: 'audios'
10 | executor_type: default # type of executor; supports "default" or "ray" for now.
11 | ray_address: auto # the address of the Ray cluster.
12 | suffixes: null
13 | add_suffix: false
14 |
--------------------------------------------------------------------------------
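
config_min.yaml above only sets global parameters; the recipe files that follow add a process list of operators on top of exactly these keys. As a rough sketch (operator names and argument names are taken from the recipes in this repository, while the dataset_path and the thresholds are placeholders, not recommended values), a runnable variant might look like:

project_name: 'min'
dataset_path: '/path/to/your/dataset'   # placeholder: a local file or directory
np: 4                                   # number of subprocesses to process your dataset
export_path: './outputs/min-processed.jsonl'
text_keys: 'text'

process:
  - text_length_filter:                 # keep samples whose text length falls in a window
      min_len: 10
      max_len: 10000
  - language_id_score_filter:           # keep samples confidently identified as one language
      min_score: 0.8
  - document_simhash_deduplicator:      # near-duplicate removal, as in the recipes below
      tokenization: space
      window_size: 6
      lowercase: true
      ignore_pattern: '\p{P}'
      num_blocks: 6
      hamming_distance: 4
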
/configs/data_juicer_recipes/alpaca_cot/alpaca-cot-en-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-alpaca-cot-en'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 50 # number of subprocesses to process your dataset
7 | open_tracer: true
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - document_deduplicator: # 104636705
13 | lowercase: true
14 | ignore_non_character: true
15 |
16 | - alphanumeric_filter: # 104636381
17 | tokenization: false
18 | min_ratio: 0.1
19 | - character_repetition_filter: # 104630030
20 | rep_len: 10
21 | max_ratio: 0.6
22 | - flagged_words_filter: # 104576967
23 | lang: en
24 | tokenization: true
25 | max_ratio: 0.017
26 | - maximum_line_length_filter: # 104575811
27 | min_len: 20
28 | - text_length_filter: # 104573711
29 | min_len: 30
30 |
31 | - document_simhash_deduplicator: # 72855345
32 | tokenization: space
33 | window_size: 3
34 | lowercase: true
35 | ignore_pattern: '\p{P}'
36 | num_blocks: 9
37 | hamming_distance: 7
38 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/alpaca_cot/alpaca-cot-zh-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-alpaca-cot-zh'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 50 # number of subprocesses to process your dataset
7 | open_tracer: true
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - document_deduplicator: # 16957516
13 | lowercase: true # whether to convert text to lower case
14 | ignore_non_character: true
15 |
16 | - alphanumeric_filter: # 16957388
17 | tokenization: false
18 | min_ratio: 0.10
19 | - character_repetition_filter: # 16956845
20 | rep_len: 10
21 | max_ratio: 0.6
22 | - flagged_words_filter: # 16954629
23 | lang: zh
24 | tokenization: true
25 | use_words_aug: true
26 | max_ratio: 0.017
27 | - text_length_filter: # 16954317
28 | min_len: 10
29 |
30 | - document_simhash_deduplicator: # 9873214
31 | tokenization: character
32 | window_size: 4 # small window size for short texts
33 | lowercase: true
34 | ignore_pattern: '\p{P}'
35 | num_blocks: 10
36 | hamming_distance: 8 # larger hamming distance threshold for short texts
37 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/data-juicer-sandbox-optimal.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-T2V-optimal'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 4 # number of subprocesses to process your dataset
7 |
8 | # process schedule
9 | # a list of several process operators with their arguments
10 | process:
11 | - video_nsfw_filter:
12 | hf_nsfw_model: Falconsai/nsfw_image_detection
13 | max_score: 0.000195383
14 | frame_sampling_method: uniform
15 | frame_num: 3
16 | reduce_mode: avg
17 | any_or_all: any
18 | mem_required: '1GB'
19 | - video_frames_text_similarity_filter:
20 | hf_clip: openai/clip-vit-base-patch32
21 | min_score: 0.306337
22 | max_score: 1.0
23 | frame_sampling_method: uniform
24 | frame_num: 3
25 | horizontal_flip: false
26 | vertical_flip: false
27 | reduce_mode: avg
28 | any_or_all: any
29 | mem_required: '10GB'
30 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/data-juicer-sandbox-self-evolution.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-T2V-evolution'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 4 # number of subprocesses to process your dataset
7 |
8 | # process schedule
9 | # a list of several process operators with their arguments
10 | process:
11 | - video_nsfw_filter:
12 | hf_nsfw_model: Falconsai/nsfw_image_detection
13 | max_score: 0.000195383
14 | frame_sampling_method: uniform
15 | frame_num: 3
16 | reduce_mode: avg
17 | any_or_all: any
18 | mem_required: '1GB'
19 | - video_frames_text_similarity_filter:
20 | hf_clip: openai/clip-vit-base-patch32
21 | min_score: 0.306337
22 | max_score: 1.0
23 | frame_sampling_method: uniform
24 | frame_num: 3
25 | horizontal_flip: false
26 | vertical_flip: false
27 | reduce_mode: avg
28 | any_or_all: any
29 | mem_required: '10GB'
30 | - video_motion_score_filter:
31 | min_score: 3
32 | max_score: 20
33 | sampling_fps: 2
34 | any_or_all: any
35 | - video_aesthetics_filter:
36 | hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE
37 | min_score: 0.418164
38 | max_score: 1.0
39 | frame_sampling_method: 'uniform'
40 | frame_num: 3
41 | reduce_mode: avg
42 | any_or_all: any
43 | mem_required: '1500MB'
44 | - video_duration_filter:
45 | min_duration: 2
46 | max_duration: 100000
47 | any_or_all: any
48 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/github_code/redpajama-code-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-code-rp'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 50 # number of subprocesses to process your dataset
7 | open_tracer: true
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - clean_email_mapper:
13 | - clean_links_mapper:
14 | - fix_unicode_mapper:
15 | - punctuation_normalization_mapper:
16 | - whitespace_normalization_mapper:
17 | - clean_copyright_mapper:
18 |
19 | - alphanumeric_filter:
20 | tokenization: False
21 | min_ratio: 0.4
22 | max_ratio: 0.8
23 | - alphanumeric_filter:
24 | tokenization: True
25 | min_ratio: 1.5
26 | max_ratio: 3
27 | - average_line_length_filter:
28 | min_len: 15
29 | max_len: 100
30 | - character_repetition_filter:
31 | rep_len: 10
32 | min_ratio: 0.05
33 | max_ratio: 0.3
34 | - maximum_line_length_filter:
35 | min_len: 50
36 | max_len: 500
37 | - text_length_filter:
38 | min_len: 300
39 | - words_num_filter:
40 | lang: en
41 | tokenization: False
42 | min_num: 30
43 | max_num: 5000
44 | - word_repetition_filter:
45 | lang: en
46 | tokenization: False
47 | rep_len: 10
48 | max_ratio: 0.1
49 | - document_simhash_deduplicator:
50 | tokenization: space
51 | window_size: 6
52 | lowercase: true
53 | ignore_pattern: '\p{P}'
54 | num_blocks: 6
55 | hamming_distance: 4
56 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/github_code/redpajama-stack-code-deduplicate.yaml:
--------------------------------------------------------------------------------
1 | project_name: 'Data-Juicer-recipes-code'
2 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
3 | export_path: '/path/to/your/dataset.jsonl' # path to your dataset result file
4 |
5 | np: 50 # number of subprocesses to process your dataset
6 | open_tracer: true
7 |
8 | # process schedule
9 | # a list of several process operators with their arguments
10 |
11 | process:
12 | - document_simhash_deduplicator:
13 | tokenization: space
14 | window_size: 6
15 | lowercase: true
16 | ignore_pattern: '\p{P}'
17 | num_blocks: 6
18 | hamming_distance: 4
19 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/github_code/stack-code-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-the-stack'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | text_keys: 'content'
7 |
8 | np: 50 # number of subprocesses to process your dataset
9 | open_tracer: true
10 |
11 | # process schedule
12 | # a list of several process operators with their arguments
13 | process:
14 | - clean_email_mapper:
15 | - clean_links_mapper:
16 | - fix_unicode_mapper:
17 | - punctuation_normalization_mapper:
18 | - whitespace_normalization_mapper:
19 | - clean_copyright_mapper:
20 |
21 | - alphanumeric_filter: # 18766
22 | tokenization: false
23 | min_ratio: 0.2 # < 3sigma (0.3791)
24 | max_ratio: 0.9163 # 3sigma
25 | - alphanumeric_filter: # 146432
26 | tokenization: true
27 | min_ratio: 0.546 # 3sigma
28 | max_ratio: 3.65 # 3sigma
29 | - average_line_length_filter: # for code
30 | min_len: 10 # > 3sigma (0) -- 48790
31 | max_len: 150 # < 3sigma (15603) -- 233275
32 | - character_repetition_filter:
33 | max_ratio: 0.36 # 3sigma -- 346875
34 | - maximum_line_length_filter: # for code
35 | max_len: 1000 # remove 256670 samples
36 | - text_length_filter:
37 | max_len: 96714 # 3sigma -- 190006
38 | - words_num_filter:
39 | min_num: 20 # remove 1504958 samples
40 | max_num: 6640 # 3sigma -- remove 179847 samples
41 | - word_repetition_filter:
42 | rep_len: 10
43 | max_ratio: 0.357 # 3sigma -- 598462
44 |
45 | - document_simhash_deduplicator:
46 | tokenization: space
47 | window_size: 6
48 | lowercase: true
49 | ignore_pattern: '\p{P}'
50 | num_blocks: 6
51 | hamming_distance: 4
52 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/pile-europarl-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-EuroParl'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 50 # number of subprocesses to process your dataset
7 | open_tracer: true
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - clean_email_mapper:
13 | - clean_links_mapper:
14 | - fix_unicode_mapper:
15 | - punctuation_normalization_mapper:
16 | - whitespace_normalization_mapper:
17 |
18 | - alphanumeric_filter:
19 | tokenization: false
20 | min_ratio: 0.75 # <3sigma (0.779)
21 | max_ratio: 0.90 # >3sigma(0.878)
22 | - average_line_length_filter: # for code
23 | max_len: 588 # 3sigma
24 | - character_repetition_filter:
25 | rep_len: 10
26 | max_ratio: 0.16 # >3sigma (0.114)
27 | - flagged_words_filter:
28 | lang: en
29 | tokenization: true
30 | max_ratio: 0.0007 # 3sigma
31 | - language_id_score_filter:
32 | min_score: 0.7
33 | - maximum_line_length_filter: # for code
34 | max_len: 4000 # >3sigma (3104)
35 | - perplexity_filter:
36 | lang: en
37 | max_ppl: 7596 #(3sigma)
38 | - special_characters_filter:
39 | max_ratio: 0.3 # > 3sigma (0.243)
40 | - text_length_filter:
41 | max_len: 2e5
42 | - words_num_filter:
43 | tokenization: true
44 | min_num: 20
45 | max_num: 1e5 # 3sigma
46 | - word_repetition_filter:
47 | lang: en
48 | tokenization: true
49 | rep_len: 10
50 | max_ratio: 0.2 # > 3sigma (0.185)
51 |
52 | - document_simhash_deduplicator:
53 | tokenization: space
54 | window_size: 6
55 | lowercase: true
56 | ignore_pattern: '\p{P}'
57 | num_blocks: 6
58 | hamming_distance: 4
59 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/pile-hackernews-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-HackerNews'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 48 # number of subprocesses to process your dataset
7 | open_tracer: true
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - clean_email_mapper:
13 | #- clean_links_mapper:
14 | - fix_unicode_mapper:
15 | - punctuation_normalization_mapper:
16 | - whitespace_normalization_mapper:
17 |
18 | - alphanumeric_filter:
19 | tokenization: false
20 | min_ratio: 0.2 #<3sigma
21 | - average_line_length_filter:
22 | min_len: 15 # >3sigma
23 | - character_repetition_filter:
24 | rep_len: 10
25 | max_ratio: 0.3 # >3sigma
26 | - flagged_words_filter:
27 | lang: en
28 | tokenization: true
29 | max_ratio: 0.05 # >3sigma
30 | - language_id_score_filter:
31 | min_score: 0.2 # <3sigma
32 | - maximum_line_length_filter:
33 | min_len: 20 # >3sigma
34 | - perplexity_filter:
35 | lang: en
36 | max_ppl: 10000 # >3sigma
37 | - special_characters_filter:
38 | max_ratio: 0.7 # >3sigma
39 | - text_length_filter:
40 | min_len: 100 # > 3sigma
41 | - words_num_filter:
42 | lang: en
43 | tokenization: true
44 | min_num: 30 # > 3sigma
45 | - word_repetition_filter:
46 | lang: en
47 | tokenization: true
48 | rep_len: 10
49 | max_ratio: 0.8 # > 3sigma
50 |
51 | - document_simhash_deduplicator:
52 | tokenization: space
53 | window_size: 6
54 | lowercase: true
55 | ignore_pattern: '\p{P}'
56 | num_blocks: 6
57 | hamming_distance: 4
58 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/pile-nih-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-NIH'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 50 # number of subprocesses to process your dataset
7 | open_tracer: true
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - clean_email_mapper:
13 | - clean_links_mapper:
14 | - fix_unicode_mapper:
15 | - punctuation_normalization_mapper:
16 | - whitespace_normalization_mapper:
17 |
18 | - alphanumeric_filter:
19 | tokenization: false
20 | min_ratio: 0.75 # <3sigma (0.800)
21 | max_ratio: 0.866
22 | - average_line_length_filter:
23 | max_len: 10000 # >3sigma (5425)
24 | - character_repetition_filter:
25 | rep_len: 10
26 | max_ratio: 0.2 # >3sigma (0.127)
27 | - flagged_words_filter:
28 | lang: en
29 | tokenization: true
30 | max_ratio: 0.0003 # 3sigma
31 | - language_id_score_filter:
32 | min_score: 0.7
33 | - perplexity_filter:
34 | lang: en
35 | max_ppl: 1669 #(3sigma)
36 | - special_characters_filter:
37 | max_ratio: 0.3 # > 3sigma (0.218)
38 | - words_num_filter:
39 | tokenization: true
40 | min_num: 20
41 | max_num: 2000
42 | - word_repetition_filter:
43 | lang: en
44 | tokenization: true
45 | rep_len: 10
46 | max_ratio: 0.104 # 3sigma
47 |
48 | - document_simhash_deduplicator:
49 | tokenization: space
50 | window_size: 6
51 | lowercase: true
52 | ignore_pattern: '\p{P}'
53 | num_blocks: 6
54 | hamming_distance: 4
55 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/pile-philpaper-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-Philpaper'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 50 # number of subprocesses to process your dataset
7 | open_tracer: true
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - clean_email_mapper:
13 | - clean_links_mapper:
14 | - fix_unicode_mapper:
15 | - punctuation_normalization_mapper:
16 | - whitespace_normalization_mapper:
17 |
18 | - alphanumeric_filter:
19 | tokenization: false
20 | min_ratio: 0.7 # <3sigma (0.72)
21 | - average_line_length_filter: # for code
22 | max_len: 5e5 # >3sigma (406006)
23 | - character_repetition_filter:
24 | rep_len: 10
25 | max_ratio: 0.2 # >3sigma (0.145)
26 | - flagged_words_filter:
27 | lang: en
28 | tokenization: true
29 | max_ratio: 0.0007 # 3sigma
30 | - language_id_score_filter:
31 | min_score: 0.6
32 | - maximum_line_length_filter: # for code
33 | max_len: 1e6 # 3sigma
34 | - perplexity_filter:
35 | lang: en
36 | max_ppl: 5000
37 | - special_characters_filter:
38 | max_ratio: 0.4 # > 3sigma (0.302)
39 | - words_num_filter:
40 | lang: en
41 | tokenization: true
42 | min_num: 1000
43 | max_num: 2e5 # 3sigma
44 | - word_repetition_filter:
45 | lang: en
46 | tokenization: true
47 | rep_len: 10
48 | max_ratio: 0.3 # > 3sigma (0.249)
49 |
50 | - document_simhash_deduplicator:
51 | tokenization: space
52 | window_size: 6
53 | lowercase: true
54 | ignore_pattern: '\p{P}'
55 | num_blocks: 6
56 | hamming_distance: 4
57 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/redpajama-arxiv-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-arxiv'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 50 # number of subprocesses to process your dataset
7 | open_tracer: true
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - clean_email_mapper:
13 | - clean_links_mapper:
14 | - fix_unicode_mapper:
15 | - punctuation_normalization_mapper:
16 | - whitespace_normalization_mapper:
17 |
18 | - alphanumeric_filter:
19 | tokenization: false
20 | min_ratio: 0.516 # 3sigma
21 | max_ratio: 0.915 # 3sigma
22 | - average_line_length_filter: # for code
23 | max_len: 682 # 3sigma
24 | - character_repetition_filter:
25 | rep_len: 10
26 | max_ratio: 0.3
27 | - flagged_words_filter:
28 | lang: en
29 | tokenization: true
30 | max_ratio: 0.00076 # 3sigma
31 | #- language_id_score_filter: # remove language filter
32 | - maximum_line_length_filter: # for code
33 | max_len: 4000
34 | - perplexity_filter:
35 | lang: en
36 | max_ppl: 8000
37 | - special_characters_filter:
38 | max_ratio: 0.6
39 | - text_length_filter:
40 | max_len: 350000
41 | - words_num_filter:
42 | lang: en
43 | tokenization: true
44 | min_num: 20
45 | max_num: 100000
46 | - word_repetition_filter:
47 | lang: en
48 | tokenization: true
49 | rep_len: 10
50 | max_ratio: 0.574 # 3sigma
51 |
52 | - document_simhash_deduplicator:
53 | tokenization: space
54 | window_size: 6
55 | lowercase: true
56 | ignore_pattern: '\p{P}'
57 | num_blocks: 6
58 | hamming_distance: 4
59 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/redpajama-book-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-book'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl'
5 |
6 | np: 50 # number of subprocesses to process your dataset
7 | open_tracer: true
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - clean_email_mapper:
13 | - clean_links_mapper:
14 | - fix_unicode_mapper:
15 | - punctuation_normalization_mapper:
16 | - whitespace_normalization_mapper:
17 |
18 | - alphanumeric_filter:
19 | tokenization: false
20 | min_ratio: 0.55 # <3sigma (0.697)
21 | max_ratio: 0.854 # 3sigma
22 | - average_line_length_filter: # for code
23 | max_len: 500 # >3sigma (364)
24 | - character_repetition_filter:
25 | rep_len: 10
26 | max_ratio: 0.2 # >3sigma (0.12)
27 | - flagged_words_filter:
28 | lang: en
29 | tokenization: true
30 | max_ratio: 0.00047 # 3sigma
31 | - language_id_score_filter: # remove language filter
32 | min_score: 0.2
33 | - maximum_line_length_filter: # for code
34 | max_len: 13381 # 3sigma
35 | - perplexity_filter:
36 | lang: en
37 | max_ppl: 6000 # <3sigma (16516)
38 | - special_characters_filter:
39 | max_ratio: 0.5 # >3sigma (0.32)
40 | - words_num_filter:
41 | lang: en
42 | tokenization: true
43 | min_num: 1000
44 | max_num: 539754 # 3sigma
45 | - word_repetition_filter:
46 | lang: en
47 | tokenization: true
48 | rep_len: 10
49 | max_ratio: 0.194 # 3sigma
50 |
51 | - document_simhash_deduplicator:
52 | tokenization: space
53 | window_size: 6
54 | lowercase: true
55 | ignore_pattern: '\p{P}'
56 | num_blocks: 6
57 | hamming_distance: 4
58 |
--------------------------------------------------------------------------------
/configs/data_juicer_recipes/redpajama-c4-refine.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'Data-Juicer-recipes-c4'
3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
4 | export_path: '/path/to/your/dataset.jsonl' # path to your dataset result file
5 |
6 | np: 50  # number of subprocesses to process your dataset
7 | open_tracer: True
8 |
9 | # process schedule
10 | # a list of several process operators with their arguments
11 | process:
12 | - clean_email_mapper:
13 | - clean_links_mapper:
14 | - fix_unicode_mapper:
15 | - punctuation_normalization_mapper:
16 | - whitespace_normalization_mapper:
17 |
18 | - alphanumeric_filter:
19 | tokenization: false
20 | min_ratio: 0.65 # <3sigma (0.740)
21 | max_ratio: 0.9 # >3sigma (0.867)
22 | - average_line_length_filter: # for code
23 | max_len: 3000 # >3sigma (1277)
24 | - character_repetition_filter:
25 | rep_len: 10
26 | max_ratio: 0.3 # >3sigma (0.168)
27 | - language_id_score_filter:
28 | min_score: 0.6
29 | - maximum_line_length_filter: # for code
30 | max_len: 4000 # >3sigma (2017)
31 | - perplexity_filter:
32 | lang: en
33 | max_ppl: 6000 #(>3sigma 4543)
34 | - special_characters_filter:
35 | max_ratio: 0.4 # > 3sigma (0.303)
36 | - words_num_filter:
37 | tokenization: true
38 | min_num: 20
39 | max_num: 10000
40 | - word_repetition_filter:
41 | lang: en
42 | tokenization: true
43 | rep_len: 10
44 | max_ratio: 0.231 # 3sigma
45 |
46 | - document_simhash_deduplicator:
47 | tokenization: space
48 | window_size: 6
49 | lowercase: true
50 | ignore_pattern: '\p{P}'
51 | num_blocks: 6
52 | hamming_distance: 4
53 |
--------------------------------------------------------------------------------
/configs/datasets/local_json.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'dataset-local-json'
3 | dataset:
4 | configs:
5 | - type: 'local'
6 | path: 'path/to/json/file'
7 |
--------------------------------------------------------------------------------
/configs/datasets/local_parquet.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'dataset-local-parquet'
3 | dataset:
4 | configs:
5 | - type: 'local'
6 | path: 'path/to/parquet/file'
7 |
--------------------------------------------------------------------------------
/configs/datasets/mixture.yaml:
--------------------------------------------------------------------------------
1 | project_name: 'dataset-mixture'
2 | dataset:
3 | max_sample_num: 10000
4 | configs:
5 | - type: 'local'
6 | weight: 1.0
7 | path: 'path/to/json/file'
8 | - type: 'local'
9 | weight: 1.0
10 | path: 'path/to/csv/file'
11 |
--------------------------------------------------------------------------------
/configs/datasets/remote_arxiv.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'dataset-remote-arxiv'
3 | dataset:
4 | configs:
5 | - type: 'remote'
6 | source: 'arxiv'
7 | lang: 'en'
8 | dump_date: 'latest'
9 | force_download: false
10 | url_limit: 2
11 |
--------------------------------------------------------------------------------
/configs/datasets/remote_commoncrawl.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'dataset-remote-commoncrawl'
3 | dataset:
4 | configs:
5 | - type: 'remote'
6 | source: 'commoncrawl'
7 | start_snapshot: '2020-50'
8 | end_snapshot: '2021-04'
9 | aws: true
10 | force_download: false
11 | url_limit: 2
12 |
--------------------------------------------------------------------------------
/configs/datasets/remote_huggingface.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'dataset-remote-huggingface'
3 | dataset:
4 | configs:
5 | - type: 'remote'
6 | source: 'huggingface'
7 | path: "HuggingFaceFW/fineweb"
8 | name: "CC-MAIN-2024-10"
9 | split: "train"
10 | limit: 1000
11 |
--------------------------------------------------------------------------------
/configs/datasets/remote_modelscope.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'dataset-remote-modelscope'
3 | dataset:
4 | configs:
5 | - type: 'remote'
6 | source: 'modelscope'
7 | path: 'modelscope/clue'
8 | subset_name: 'afqmc'
9 | split: 'train'
10 | limit: 1000
11 |
--------------------------------------------------------------------------------
/configs/datasets/remote_wiki.yaml:
--------------------------------------------------------------------------------
1 | # global parameters
2 | project_name: 'dataset-remote-wiki'
3 | dataset:
4 | configs:
5 | - type: 'remote'
6 | source: 'wiki'
7 | lang: 'en'
8 | dump_date: 'latest'
9 | force_download: false
10 | url_limit: 2
11 |
--------------------------------------------------------------------------------
/configs/datasets/validation_required_fields.yaml:
--------------------------------------------------------------------------------
1 | dataset:
2 | configs:
3 | - type: local
4 | path: /path/to/data.json
5 |
6 | validators:
7 | - type: required_fields
8 | required_fields:
9 | - "text"
10 | - "metadata"
11 | - "language"
12 | field_types:
13 | text: "str"
14 | metadata: "dict"
15 | language: "str"
16 |
--------------------------------------------------------------------------------
/configs/datasets/validation_swift_messages.yaml:
--------------------------------------------------------------------------------
1 | project_name: validation_swift_messages_demo
2 |
3 | dataset:
4 | configs:
5 | - type: local
6 | path: ./demos/data/demo-dataset-chatml.jsonl
7 |
8 | text_keys: messages
9 |
10 | export_path: ./outputs/validation_swift_messages_demo/output.jsonl
11 |
12 | validators:
13 | - type: swift_messages
14 | min_turns: 1 # Minimum number of user-assistant turns
15 | max_turns: 20 # Maximum number of turns including system message
16 |
17 | process:
18 | - text_length_filter:
19 | max_len: 18221 # 3sigma
20 |
--------------------------------------------------------------------------------
/configs/demo/analyzer.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for dataset
2 |
3 | # global parameters
4 | project_name: 'demo-analyzer'
5 | dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file
6 | np: 4  # number of subprocesses to process your dataset
7 |
8 | export_path: './outputs/demo-analyzer/demo-analyzer-result.jsonl'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 | - language_id_score_filter:
14 | lang: 'en'
15 | - perplexity_filter:
16 | lang: 'en'
17 | max_ppl: 1500
18 |
--------------------------------------------------------------------------------
/configs/demo/bench/3_duplicate_pipeline.yaml:
--------------------------------------------------------------------------------
1 | # Sandbox config example
2 |
3 | # global parameters
4 | project_name: 'demo-bench'
5 | experiment_name: 'single_op_language_score' # for wandb tracer name
6 | work_dir: './outputs/demo-bench' # the default output dir for meta logging
7 |
8 | # configs for each job; the jobs will be executed in the order they appear in the list
9 | probe_job_configs:
10 |
11 | refine_recipe_job_configs:
12 |
13 | execution_job_configs:
14 | # train model
15 | - hook: 'TrainModelHook'
16 | meta_name:
17 | dj_configs:
18 | extra_configs: './configs/demo/bench/model_train_2_epoch.yaml' # the input data is set to be demo-dataset-with-multi-op-stats.jsonl
19 | # infer model
20 | - hook: 'InferModelHook'
21 | meta_name:
22 | dj_configs:
23 | extra_configs: './configs/demo/bench/model_infer.yaml'
24 |
25 | evaluation_job_configs:
26 | # vbench evaluation
27 | - hook: 'EvaluateDataHook'
28 | meta_name: 'vbench_eval'
29 | dj_configs:
30 | extra_configs: './configs/demo/bench/vbench_eval.yaml'
31 |
--------------------------------------------------------------------------------
/configs/demo/bench/model_infer.yaml:
--------------------------------------------------------------------------------
1 | type: easyanimate
2 | model_name: "easyanimate"
3 | infer_name: "easyanimate-lora-generate"
4 | train:
5 | model_path:
6 | # path to the pixart model or the hugging face model
7 | pretrained_model_name_or_path: "PixArt-alpha/PixArt-XL-2-512x512"
8 | # path to pretrained easyanimate checkpoint. Following are the links to available checkpoints.
9 | # https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
10 | transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
11 | # Note that the root path is in "thirdparty/easy_animate"
12 | lora_path: "../../../outputs/demo-bench/models/checkpoint-2.safetensors"
13 |
14 | infer_config:
15 | # must match the pretrained easyanimate checkpoint.
16 | image_size: 256
17 | prompt_info_path: "../../../tools/mm_eval/vbench_metrics/VBench_mini_info.json" # Use VBench_full_info.json for full eval.
18 | gpu_num: 1
19 | batch_size: 8
20 | mixed_precision: "bf16"
21 | video_num_per_prompt: 5
22 | seed: 43
23 |
24 | saving_config:
25 | output_video_dir: "../../../outputs/demo-bench/generated_videos"
26 |
--------------------------------------------------------------------------------
/configs/demo/bench/model_train.yaml:
--------------------------------------------------------------------------------
1 | type: easyanimate
2 | model_name: "easyanimate"
3 | trainer_name: "easyanimate-lora-trainer"
4 | train:
5 | tracker_config:
6 | # config for wandb
7 | project_name: "demo-bench"
8 | experiment_name: 'demo-single-op-model-train'
9 | model_path:
10 | # path to the pixart model or the hugging face model
11 | pretrained_model_name_or_path: "PixArt-alpha/PixArt-XL-2-512x512"
12 | # path to pretrained easyanimate checkpoint. Following are the links to available checkpoints.
13 | # https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
14 | transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
15 | dataset_path:
16 |     # The root directory of the videos. Leave it empty if the dataset uses absolute paths.
17 | dataset_name: ""
18 | # path to the Data-Juicer dataset. Note that the root path is in "thirdparty/models/EasyAnimate"
19 | dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-for-train.jsonl"
20 | training_config:
21 | # image size, must match the pretrained easyanimate checkpoint.
22 | sample_size: 256
23 | mixed_precision: "bf16"
24 | batch_size_per_gpu: 8
25 | gradient_accumulation_steps: 1
26 | num_train_epochs: 1
27 | dataloader_num_workers: 8
28 | seed: 42
29 | saving_config:
30 | # Note that the root path is in "thirdparty/models/EasyAnimate"
31 | output_dir: "../../../outputs/demo-bench/models"
32 |
--------------------------------------------------------------------------------
/configs/demo/bench/model_train_2_epoch.yaml:
--------------------------------------------------------------------------------
1 | type: easyanimate
2 | model_name: "easyanimate"
3 | trainer_name: "easyanimate-lora-trainer"
4 | train:
5 | tracker_config:
6 | # config for wandb
7 | project_name: "demo-bench"
8 | experiment_name: 'demo-single-op-model-train'
9 | model_path:
10 | # path to the pixart model or the hugging face model
11 | pretrained_model_name_or_path: "PixArt-alpha/PixArt-XL-2-512x512"
12 | # path to pretrained easyanimate checkpoint. Following are the links to available checkpoints.
13 | # https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
14 | transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
15 | dataset_path:
16 |     # The root directory of the videos. Leave it empty if the dataset uses absolute paths.
17 | dataset_name: ""
18 | # path to the Data-Juicer dataset. Note that the root path is in "thirdparty/easy_animate"
19 | dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl"
20 | training_config:
21 | # image size, must match the pretrained easyanimate checkpoint.
22 | sample_size: 256
23 | mixed_precision: "bf16"
24 | batch_size_per_gpu: 8
25 | gradient_accumulation_steps: 1
26 | num_train_epochs: 2
27 | dataloader_num_workers: 8
28 | seed: 42
29 | saving_config:
30 | # Note that the root path is in "thirdparty/easy_animate"
31 | output_dir: "../../../outputs/demo-bench/models"
32 |
--------------------------------------------------------------------------------
/configs/demo/bench/vbench_eval.yaml:
--------------------------------------------------------------------------------
1 | type: vbench_video_evaluator
2 |
3 | # The vbench prompts for video generation. Use VBench_full_info.json for full eval.
4 | prompt_path: ./tools/mm_eval/vbench_metrics/VBench_mini_info.json
5 |
6 | # The path to the dir of generated videos
7 | videos_path: ./outputs/demo-bench/generated_videos
8 |
9 | # The dir to store the eval results
10 | result_dir: ./outputs/demo-bench/eval_results
11 |
12 | # Give a name for this eval
13 | eval_name: mini_test
14 |
15 | # If true, load the required models for VBench from the cache path given by the environment variable VBENCH_CACHE_DIR
16 | load_ckpt_from_local: false
17 |
18 | # The dimensions considered in this eval.
19 | # All dimensions include: ['subject_consistency', 'background_consistency', 'temporal_flickering',
20 | # 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality', 'object_class',
21 | # 'multiple_objects', 'human_action', 'color', 'spatial_relationship', 'scene', 'temporal_style',
22 | # 'appearance_style', 'overall_consistency']
23 | dimension_list:
24 | - subject_consistency
25 | - dynamic_degree
26 |
--------------------------------------------------------------------------------
/configs/demo/dedup.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for dataset
2 |
3 | # global parameters
4 | project_name: 'demo-dedup'
5 | dataset_path: './demos/data/demo-dataset-deduplication.jsonl' # path to your dataset directory or file
6 | np: 4  # number of subprocesses to process your dataset
7 |
8 | open_tracer: true
9 |
10 | export_path: './outputs/demo-dedup/demo-dedup-processed.jsonl'
11 |
12 | # process schedule
13 | # a list of several process operators with their arguments
14 | process:
15 | - language_id_score_filter:
16 | lang: en
17 | min_score: 0.5
18 | # - document_deduplicator:
19 | # lowercase: false
20 | # ignore_non_character: false
21 | - document_minhash_deduplicator:
22 | tokenization: 'character'
23 |
--------------------------------------------------------------------------------
/configs/demo/process-huggingface.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for dataset
2 |
3 | # global parameters
4 | project_name: 'demo-process'
5 | dataset:
6 | configs:
7 | - type: 'remote'
8 | source: 'huggingface'
9 | path: 'hugfaceguy0001/retarded_bar'
10 | name: 'question'
11 | split: 'train'
12 |
13 | np: 4  # number of subprocesses to process your dataset
14 |
15 | export_path: './outputs/demo-process/demo-processed.jsonl'
16 |
17 | # process schedule
18 | # a list of several process operators with their arguments
19 | process:
20 | - language_id_score_filter:
21 | lang: 'zh'
22 | min_score: 0.8
23 |
--------------------------------------------------------------------------------
/configs/demo/process.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for dataset
2 |
3 | # global parameters
4 | project_name: 'demo-process'
5 | dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file
6 | np: 4  # number of subprocesses to process your dataset
7 |
8 | export_path: './outputs/demo-process/demo-processed.jsonl'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 | - language_id_score_filter:
14 | lang: 'zh'
15 | min_score: 0.8
16 |
--------------------------------------------------------------------------------
/configs/demo/sandbox/gpt3_data_quality_eval_config.yaml:
--------------------------------------------------------------------------------
1 | type: dj_text_quality_classifier
2 | dataset_path: './outputs/demo-process/demo-processed.jsonl'
3 |
--------------------------------------------------------------------------------
/configs/demo/sandbox/gpt3_extra_train_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "modelscope",
3 | "dataset_path": "./outputs/demo-process/demo-processed.jsonl",
4 | "work_dir": "./outputs/sandbox-train/",
5 | "model_name": "iic/nlp_gpt3_text-generation_chinese-base",
6 | "trainer_name": "nlp-base-trainer",
7 | "key_remapping": {
8 | "text": "src_txt"
9 | },
10 | "train": {
11 | "max_epochs": 2,
12 | "lr_scheduler": {
13 | "type": "StepLR",
14 | "step_size": 2,
15 | "options": {
16 | "by_epoch": false
17 | }
18 | },
19 | "optimizer": {
20 | "type": "AdamW",
21 | "lr": 3e-4
22 | },
23 | "dataloader": {
24 | "batch_size_per_gpu": 2,
25 | "workers_per_gpu": 0
26 | }
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/configs/demo/sandbox/gpt3_extra_train_config.yaml:
--------------------------------------------------------------------------------
1 | type: modelscope
2 | dataset_path: './outputs/demo-process/demo-processed.jsonl'
3 | work_dir: './outputs/sandbox-train/'
4 | model_name: "iic/nlp_gpt3_text-generation_chinese-base"
5 | trainer_name: "nlp-base-trainer"
6 | key_remapping:
7 | text: "src_txt"
8 | train:
9 | max_epochs: 2
10 | lr_scheduler:
11 | type: "StepLR"
12 | step_size: 2
13 | options:
14 | by_epoch: false
15 | optimizer:
16 | type: "AdamW"
17 | lr: 0.0003
18 | dataloader:
19 | batch_size_per_gpu: 2
20 | workers_per_gpu: 0
21 |
--------------------------------------------------------------------------------
/configs/reproduced_bloom/README.md:
--------------------------------------------------------------------------------
1 | # BLOOM Config Files
2 |
3 | This folder contains example configuration files to easily and quickly reproduce the processing flow of the [ROOTS](https://github.com/bigscience-workshop/data-preparation) dataset, created by the BigScience initiative to train the BLOOM models.
4 |
5 | ## Oscar
6 | The raw data files can be downloaded as described in [BLOOM/Oscar](https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/01b_oscar_cleaning_and_filtering). Then use [bloom-oscar.yaml](bloom-oscar.yaml) to run the whole processing pipeline, as sketched below.
7 |
8 | An analysis of our reproduction will be published soon.
9 |
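A minimal Python sketch for launching the recipe, assuming `data-juicer` is installed, the `dataset_path`/`export_path` placeholders in the YAML have been filled in, and the script is run from the repository root (the command-line equivalent is `dj-process --config <yaml>`):

```python
# Programmatic launch sketch; the config path is relative to the repo root.
from data_juicer.config import init_configs
from data_juicer.core import DefaultExecutor

cfg = init_configs(args=['--config', 'configs/reproduced_bloom/bloom-oscar.yaml'])
executor = DefaultExecutor(cfg)
executor.run()
```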
--------------------------------------------------------------------------------
/configs/reproduced_bloom/README_ZH.md:
--------------------------------------------------------------------------------
1 | # BLOOM Config Files
2 |
3 | This folder contains config files for easily reproducing the processing flow of [ROOTS](https://github.com/bigscience-workshop/data-preparation), the dataset created by BigScience and used to train the BLOOM models.
4 |
5 | ## Oscar
6 |
7 | The raw files can be downloaded as described in [BLOOM/Oscar](https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/01b_oscar_cleaning_and_filtering); then use [bloom-oscar.yaml](bloom-oscar.yaml) to run the complete processing flow.
8 |
9 | An analysis of our reproduction will be published later.
10 |
--------------------------------------------------------------------------------
/configs/reproduced_bloom/bloom-oscar.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for Oscar used in BLOOM
2 |
3 | # global parameters
4 | project_name: 'bloom_oscar'
5 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
6 | np: 4  # number of subprocesses to process your dataset
7 |
8 | export_path: '/path/to/result/dataset.jsonl'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 | # filter English corpus
14 | - language_id_score_filter:
15 | lang: en
16 | min_score: 0.8
17 |
18 | # basic process for regular English text
19 | - whitespace_normalization_mapper:
20 | - punctuation_normalization_mapper:
21 | - fix_unicode_mapper:
22 | - remove_words_with_incorrect_substrings_mapper:
23 | - remove_long_words_mapper:
24 | max_len: 25
25 |
26 | # basic filter rules for regular English text
27 | - words_num_filter:
28 | min_num: 20
29 | max_num: 100000
30 | - character_repetition_filter:
31 | rep_len: 10
32 | min_ratio: 0.0
33 | max_ratio: 0.106
34 | - word_repetition_filter:
35 | rep_len: 5
36 | min_ratio: 0.0
37 | max_ratio: 0.19
38 | - special_characters_filter:
39 | min_ratio: 0.0
40 | max_ratio: 0.4
41 | - stopwords_filter:
42 | lang: en
43 | min_ratio: 0.3
44 | - flagged_words_filter:
45 | lang: en
46 | max_ratio: 0.01
47 | - perplexity_filter:
48 | lang: en
49 | max_ppl: 1500
50 |
51 | # basic deduplication rules for regular English text
52 | - document_simhash_deduplicator:
53 | tokenization: space
54 | window_size: 6
55 | lowercase: true
56 | ignore_pattern: '\p{P}'
57 | num_blocks: 6
58 | hamming_distance: 4
59 |
--------------------------------------------------------------------------------
/configs/reproduced_redpajama/redpajama-arxiv.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for arXiv dataset
2 |
3 | # global parameters
4 | project_name: 'arXiv'
5 | dataset_path: '/path/to/your/dataset/dir/or/file' # path to your dataset directory or file
6 | np: 32  # number of subprocesses to process your dataset
7 |
8 | export_path: '/path/to/your/exported/dataset/file'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 | - remove_header_mapper:
14 | drop_no_head: true
15 | - remove_comments_mapper:
16 | doc_type: ['md', 'tex']
17 | inline: true
18 | multiline: true
19 | - remove_bibliography_mapper:
20 | - expand_macro_mapper:
21 | - text_length_filter:
22 | min_len: 1
23 |
--------------------------------------------------------------------------------
/configs/reproduced_redpajama/redpajama-books.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for Books used in RedPajama
2 |
3 | # global parameters
4 | project_name: 'RedPajam-books'
5 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
6 | np: 4  # number of subprocesses to process your dataset
7 |
8 | export_path: '/path/to/result/dataset.jsonl'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 | - document_simhash_deduplicator:
14 | tokenization: character
15 | window_size: 6
16 | lowercase: True
17 | ignore_pattern: '[^\w]+'
18 | num_blocks: 6
19 | hamming_distance: 5
20 |
--------------------------------------------------------------------------------
/configs/reproduced_redpajama/redpajama-code.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for code used in RedPajama
2 |
3 | # global parameters
4 | project_name: 'RedPajam-codes'
5 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
6 | np: 4  # number of subprocesses to process your dataset
7 |
8 | export_path: '/path/to/result/dataset.jsonl'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 | - document_deduplicator:
14 | - clean_copyright_mapper:
15 | - maximum_line_length_filter:
16 | min_len: 1
17 | max_len: 1000
18 | - average_line_length_filter:
19 | min_len: 1
20 | max_len: 100
21 | - alphanumeric_filter:
22 | min_ratio: 0.25
23 | max_ratio: 1.0
24 | - alphanumeric_filter:
25 | tokenization: True
26 | min_ratio: 1.5
27 | - suffix_filter:
28 | suffixes: [".asm", ".bat", ".cmd", ".c", ".h", ".cs", ".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H", ".cmake", ".css",
29 | ".dockerfile", ".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp", ".go", ".hs", ".html", ".java", ".js",
30 | ".jl", ".lua", ".md", ".markdown", ".php", ".php3", ".php4", ".php5", ".phps", ".phpt", ".pl", ".pm", ".pod", ".perl",
31 | ".ps1", ".psd1", ".psm1", ".py", ".rb", ".rs", ".sql", ".scala", ".sh", ".bash", ".command", ".zsh", ".ts", ".tsx",
32 | ".tex", ".vb", "Dockerfile", "Makefile", ".xml", ".rst", ".m", ".smali"]
33 |
--------------------------------------------------------------------------------
/configs/reproduced_redpajama/redpajama-stackexchange.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for stackexchange used in RedPajama
2 |
3 | # global parameters
4 | project_name: 'RedPajam-stackexchange'
5 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
6 | np: 4  # number of subprocesses to process your dataset
7 | use_cache: False
8 |
9 | export_path: '/path/to/result/dataset.jsonl'
10 |
11 | # process schedule
12 | # a list of several process operators with their arguments
13 | process:
14 | - clean_html_mapper:
15 | - language_id_score_filter:
16 | lang: ''
17 | min_score: 0.0
18 |
--------------------------------------------------------------------------------
/data_juicer/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '1.3.3'
2 |
3 | import os
4 | import subprocess
5 | import sys
6 |
7 | from loguru import logger
8 | # allow loading truncated images for some too large images.
9 | from PIL import ImageFile
10 |
11 | from data_juicer.utils.availability_utils import _is_package_available
12 | from data_juicer.utils.lazy_loader import LazyLoader
13 |
14 | torch = LazyLoader('torch')
15 | ImageFile.LOAD_TRUNCATED_IMAGES = True
16 |
17 | # For now, only INFO will be shown. Later the severity level will be changed
18 | # when setup_logger is called to initialize the logger.
19 | logger.remove()
20 | logger.add(sys.stderr, level='INFO')
21 |
22 |
23 | def _cuda_device_count():
24 | _torch_available = _is_package_available('torch')
25 |
26 | if _torch_available:
27 | return torch.cuda.device_count()
28 |
29 | try:
30 | nvidia_smi_output = subprocess.check_output(['nvidia-smi', '-L'],
31 | text=True)
32 | all_devices = nvidia_smi_output.strip().split('\n')
33 |
34 | cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
35 | if cuda_visible_devices is not None:
36 | logger.warning(
37 | 'CUDA_VISIBLE_DEVICES is ignored when torch is unavailable. '
38 | 'All detected GPUs will be used.')
39 |
40 | return len(all_devices)
41 | except Exception:
42 | # nvidia-smi not found or other error
43 | return 0
44 |
45 |
46 | def cuda_device_count():
47 | return _cuda_device_count()
48 |
49 |
50 | def is_cuda_available():
51 | return cuda_device_count() > 0
52 |
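For reference, a minimal check of the two helpers defined above; on CPU-only machines the nvidia-smi fallback simply returns 0:

```python
import data_juicer

# Prefers torch.cuda.device_count(); falls back to parsing `nvidia-smi -L`.
print(data_juicer.cuda_device_count())
# Convenience wrapper: True only when at least one GPU is detected.
print(data_juicer.is_cuda_available())
```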
--------------------------------------------------------------------------------
/data_juicer/analysis/__init__.py:
--------------------------------------------------------------------------------
1 | from .column_wise_analysis import ColumnWiseAnalysis
2 | from .diversity_analysis import DiversityAnalysis
3 | from .overall_analysis import OverallAnalysis
4 |
5 | __all__ = [
6 | 'ColumnWiseAnalysis',
7 | 'DiversityAnalysis',
8 | 'OverallAnalysis',
9 | ]
10 |
--------------------------------------------------------------------------------
/data_juicer/analysis/draw.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import seaborn as sns
4 |
5 |
6 | def draw_heatmap(data,
7 | xlabels,
8 | ylabels='auto',
9 | figsize=None,
10 | triangle=False,
11 | show=False):
12 | """
13 | Draw heatmap of input data with special labels.
14 |
15 | :param data: input data, now support
16 | [`list`, `tuple`, `numpy array`, 'torch tensor']
17 | :param xlabels: x axis labels.
18 | :param ylabels: y axis labels, if None, use xlabels.
19 | :param figsize: figure size.
20 |     :param triangle: whether to display only the lower triangle of the matrix.
21 | :return: a plot figure.
22 | """
23 | figsize = figsize if figsize else (8 * 2.5, 6 * 2.5)
24 | _, ax = plt.subplots(figsize=figsize)
25 | mask = None
26 | if triangle:
27 | mask = np.triu(np.ones_like(data))
28 | ax.tick_params(
29 | right=True,
30 | top=True,
31 | labelright=True,
32 | labeltop=True,
33 | )
34 | sns.heatmap(data,
35 | ax=ax,
36 | cmap='Oranges',
37 | annot=True,
38 | mask=mask,
39 | linewidths=.05,
40 | square=True,
41 | xticklabels=xlabels,
42 | yticklabels=ylabels,
43 | annot_kws={'size': 8})
44 | plt.subplots_adjust(left=.1, right=0.95, bottom=0.22, top=0.95)
45 | fig = plt.gcf()
46 | if show:
47 | plt.show()
48 | return fig
49 |
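An illustrative call to `draw_heatmap` on a small made-up similarity matrix (assumes numpy, matplotlib and seaborn are available, as imported above):

```python
import numpy as np

from data_juicer.analysis.draw import draw_heatmap

# Hypothetical 3x3 similarity matrix between three operators.
sim = np.array([[1.0, 0.8, 0.3],
                [0.8, 1.0, 0.5],
                [0.3, 0.5, 1.0]])
labels = ['op_a', 'op_b', 'op_c']

# With triangle=True, only the lower triangle is drawn.
fig = draw_heatmap(sim, xlabels=labels, ylabels=labels, triangle=True)
fig.savefig('op_similarity_heatmap.png')
```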
--------------------------------------------------------------------------------
/data_juicer/config/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import (export_config, get_default_cfg, get_init_configs,
2 | init_configs, merge_config, prepare_side_configs)
3 |
4 | __all__ = [
5 | 'init_configs', 'get_init_configs', 'export_config', 'merge_config',
6 | 'prepare_side_configs', 'get_default_cfg'
7 | ]
8 |
--------------------------------------------------------------------------------
/data_juicer/core/__init__.py:
--------------------------------------------------------------------------------
1 | from .adapter import Adapter
2 | from .analyzer import Analyzer
3 | from .data import NestedDataset
4 | from .executor import DefaultExecutor, ExecutorBase, ExecutorFactory
5 | from .exporter import Exporter
6 | from .monitor import Monitor
7 | from .tracer import Tracer
8 |
9 | __all__ = [
10 | 'Adapter',
11 | 'Analyzer',
12 | 'NestedDataset',
13 | 'ExecutorBase',
14 | 'ExecutorFactory',
15 | 'DefaultExecutor',
16 | 'Exporter',
17 | 'Monitor',
18 | 'Tracer',
19 | ]
20 |
--------------------------------------------------------------------------------
/data_juicer/core/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .dj_dataset import (DJDataset, NestedDataset,
2 | add_same_content_to_new_column,
3 | wrap_func_with_nested_access)
4 |
5 | __all__ = [
6 | 'DJDataset', 'NestedDataset', 'wrap_func_with_nested_access',
7 | 'add_same_content_to_new_column'
8 | ]
9 |
--------------------------------------------------------------------------------
/data_juicer/core/executor/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import ExecutorBase
2 | from .default_executor import DefaultExecutor
3 | from .factory import ExecutorFactory
4 |
5 | __all__ = ['ExecutorBase', 'ExecutorFactory', 'DefaultExecutor']
6 |
--------------------------------------------------------------------------------
/data_juicer/core/executor/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Optional
3 |
4 | from jsonargparse import Namespace
5 | from pydantic import PositiveInt
6 |
7 | from data_juicer.config import init_configs
8 |
9 |
10 | class ExecutorBase(ABC):
11 |
12 | @abstractmethod
13 | def __init__(self, cfg: Optional[Namespace] = None):
14 | self.cfg = init_configs() if cfg is None else cfg
15 | self.executor_type = 'base'
16 |
17 | @abstractmethod
18 | def run(self,
19 | load_data_np: Optional[PositiveInt] = None,
20 | skip_return=False):
21 | raise NotImplementedError
22 |
--------------------------------------------------------------------------------
/data_juicer/core/executor/factory.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | from .default_executor import DefaultExecutor
4 | from .ray_executor import RayExecutor
5 |
6 |
7 | class ExecutorFactory:
8 |
9 | @staticmethod
10 | def create_executor(
11 | executor_type: str) -> Union[DefaultExecutor, RayExecutor]:
12 | if executor_type in ('local', 'default'):
13 | return DefaultExecutor()
14 | elif executor_type == 'ray':
15 | return RayExecutor()
16 | # TODO: add nemo support
17 | # elif executor_type == "nemo":
18 | # return NemoExecutor()
19 | # TODO: add dask support
20 | # elif executor_type == "dask":
21 | # return DaskExecutor()
22 | else:
23 | raise ValueError('Unsupported executor type')
24 |
--------------------------------------------------------------------------------
/data_juicer/download/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/data_juicer/download/__init__.py
--------------------------------------------------------------------------------
/data_juicer/download/commoncrawl.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/data_juicer/download/commoncrawl.py
--------------------------------------------------------------------------------
/data_juicer/format/__init__.py:
--------------------------------------------------------------------------------
1 | from .csv_formatter import CsvFormatter
2 | from .empty_formatter import EmptyFormatter, RayEmptyFormatter
3 | from .formatter import LocalFormatter, RemoteFormatter
4 | from .json_formatter import JsonFormatter
5 | from .parquet_formatter import ParquetFormatter
6 | from .text_formatter import TextFormatter
7 | from .tsv_formatter import TsvFormatter
8 |
9 | __all__ = [
10 | 'JsonFormatter', 'LocalFormatter', 'RemoteFormatter', 'TextFormatter',
11 | 'ParquetFormatter', 'CsvFormatter', 'TsvFormatter', 'EmptyFormatter',
12 | 'RayEmptyFormatter'
13 | ]
14 |
--------------------------------------------------------------------------------
/data_juicer/format/csv_formatter.py:
--------------------------------------------------------------------------------
1 | from .formatter import FORMATTERS, LocalFormatter
2 |
3 |
4 | @FORMATTERS.register_module()
5 | class CsvFormatter(LocalFormatter):
6 | """
7 | The class is used to load and format csv-type files.
8 |
9 |     Default suffixes are `['.csv']`
10 | """
11 | SUFFIXES = ['.csv']
12 |
13 | def __init__(self, dataset_path, suffixes=None, **kwargs):
14 | """
15 | Initialization method.
16 |
17 | :param dataset_path: a dataset file or a dataset directory
18 | :param suffixes: files with specified suffixes to be processed
19 | :param kwargs: extra args
20 | """
21 | super().__init__(
22 | dataset_path=dataset_path,
23 | suffixes=suffixes if suffixes else self.SUFFIXES,
24 | type='csv',
25 | **kwargs,
26 | )
27 |
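A usage sketch for the formatter; the directory path is hypothetical, and `load_dataset` is assumed to be the loading entry point inherited from `LocalFormatter`:

```python
from data_juicer.format import CsvFormatter

# Hypothetical directory containing *.csv files.
formatter = CsvFormatter(dataset_path='/path/to/csv/dir')
# Assumed inherited API for materializing the dataset.
dataset = formatter.load_dataset(num_proc=4)
print(dataset)
```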
--------------------------------------------------------------------------------
/data_juicer/format/json_formatter.py:
--------------------------------------------------------------------------------
1 | from .formatter import FORMATTERS, LocalFormatter
2 |
3 |
4 | @FORMATTERS.register_module()
5 | class JsonFormatter(LocalFormatter):
6 | """
7 | The class is used to load and format json-type files.
8 |
9 |     Default suffixes are `['.json', '.jsonl', '.jsonl.zst']`
10 | """
11 | SUFFIXES = ['.json', '.jsonl', '.jsonl.zst']
12 |
13 | def __init__(self, dataset_path, suffixes=None, **kwargs):
14 | """
15 | Initialization method.
16 |
17 | :param dataset_path: a dataset file or a dataset directory
18 | :param suffixes: files with specified suffixes to be processed
19 | :param kwargs: extra args
20 | """
21 | super().__init__(
22 | dataset_path=dataset_path,
23 | suffixes=suffixes if suffixes else self.SUFFIXES,
24 | type='json',
25 | **kwargs,
26 | )
27 |
--------------------------------------------------------------------------------
/data_juicer/format/parquet_formatter.py:
--------------------------------------------------------------------------------
1 | from .formatter import FORMATTERS, LocalFormatter
2 |
3 |
4 | @FORMATTERS.register_module()
5 | class ParquetFormatter(LocalFormatter):
6 | """
7 | The class is used to load and format parquet-type files.
8 |
9 |     Default suffixes are `['.parquet']`
10 | """
11 | SUFFIXES = ['.parquet']
12 |
13 | def __init__(self, dataset_path, suffixes=None, **kwargs):
14 | """
15 | Initialization method.
16 |
17 | :param dataset_path: a dataset file or a dataset directory
18 | :param suffixes: files with specified suffixes to be processed
19 | :param kwargs: extra args
20 | """
21 | super().__init__(
22 | dataset_path=dataset_path,
23 | suffixes=suffixes if suffixes else self.SUFFIXES,
24 | type='parquet',
25 | **kwargs,
26 | )
27 |
--------------------------------------------------------------------------------
/data_juicer/format/tsv_formatter.py:
--------------------------------------------------------------------------------
1 | from .formatter import FORMATTERS, LocalFormatter
2 |
3 |
4 | @FORMATTERS.register_module()
5 | class TsvFormatter(LocalFormatter):
6 | """
7 | The class is used to load and format tsv-type files.
8 |
9 |     Default suffixes are `['.tsv']`
10 | """
11 | SUFFIXES = ['.tsv']
12 |
13 | def __init__(self, dataset_path, suffixes=None, **kwargs):
14 | """
15 | Initialization method.
16 |
17 | :param dataset_path: a dataset file or a dataset directory
18 | :param suffixes: files with specified suffixes to be processed
19 | :param kwargs: extra args, e.g. `delimiter = ','`
20 | """
21 | super().__init__(
22 | dataset_path=dataset_path,
23 | suffixes=suffixes if suffixes else self.SUFFIXES,
24 | type='csv',
25 | delimiter='\t',
26 | **kwargs,
27 | )
28 |
--------------------------------------------------------------------------------
/data_juicer/ops/__init__.py:
--------------------------------------------------------------------------------
1 | # yapf: disable
2 | from . import aggregator, deduplicator, filter, grouper, mapper, selector
3 | from .base_op import (NON_STATS_FILTERS, OPERATORS, TAGGING_OPS, UNFORKABLE,
4 | Aggregator, Deduplicator, Filter, Grouper, Mapper,
5 | Selector)
6 | from .load import load_ops
7 |
8 | __all__ = [
9 | 'load_ops',
10 | 'Filter',
11 | 'Mapper',
12 | 'Deduplicator',
13 | 'Selector',
14 | 'Grouper',
15 | 'Aggregator',
16 | 'UNFORKABLE',
17 | 'NON_STATS_FILTERS',
18 | 'OPERATORS',
19 | 'TAGGING_OPS',
20 | ]
21 |
--------------------------------------------------------------------------------
/data_juicer/ops/aggregator/__init__.py:
--------------------------------------------------------------------------------
1 | from .entity_attribute_aggregator import EntityAttributeAggregator
2 | from .meta_tags_aggregator import MetaTagsAggregator
3 | from .most_relevant_entities_aggregator import MostRelevantEntitiesAggregator
4 | from .nested_aggregator import NestedAggregator
5 |
6 | __all__ = [
7 | 'NestedAggregator', 'MetaTagsAggregator', 'EntityAttributeAggregator',
8 | 'MostRelevantEntitiesAggregator'
9 | ]
10 |
--------------------------------------------------------------------------------
/data_juicer/ops/common/__init__.py:
--------------------------------------------------------------------------------
1 | from .helper_func import (get_sentences_from_document, get_words_from_document,
2 | merge_on_whitespace_tab_newline,
3 | split_on_newline_tab_whitespace, split_on_whitespace,
4 | split_text_by_punctuation, strip, words_augmentation,
5 | words_refinement)
6 | from .special_characters import SPECIAL_CHARACTERS
7 |
8 | __all__ = [
9 | 'get_sentences_from_document', 'get_words_from_document',
10 | 'merge_on_whitespace_tab_newline', 'split_on_newline_tab_whitespace',
11 | 'split_on_whitespace', 'strip', 'words_augmentation', 'words_refinement',
12 | 'split_text_by_punctuation', 'SPECIAL_CHARACTERS'
13 | ]
14 |
--------------------------------------------------------------------------------
/data_juicer/ops/common/special_characters.py:
--------------------------------------------------------------------------------
1 | # Most of the code here has been modified from:
2 | # https://huggingface.co/spaces/huggingface/text-data-filtering
3 | # --------------------------------------------------------
4 |
5 | import string
6 |
7 | import emoji
8 |
9 | # special characters
10 | MAIN_SPECIAL_CHARACTERS = string.punctuation + string.digits \
11 | + string.whitespace
12 | OTHER_SPECIAL_CHARACTERS = (
13 | " ’“”–ー一▬…✦�£•€«»°·═"
14 |     "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰‑≤≥‖"
15 | "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
16 | "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
17 | "」﴾》"
18 | )
19 | EMOJI = list(emoji.EMOJI_DATA.keys())
20 | SPECIAL_CHARACTERS = set(MAIN_SPECIAL_CHARACTERS + OTHER_SPECIAL_CHARACTERS)
21 | SPECIAL_CHARACTERS.update(EMOJI)
22 |
23 | # various whitespaces for whitespace normalization
24 | # whitespaces in unicode can be found here:
25 | # https://en.wikipedia.org/wiki/Whitespace_character
26 | VARIOUS_WHITESPACES = {
27 | ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
28 | ' ', ' ', ' ', ' ', '', '', '', '', '', ''
29 | }
30 |
--------------------------------------------------------------------------------
/data_juicer/ops/deduplicator/__init__.py:
--------------------------------------------------------------------------------
1 | from .document_deduplicator import DocumentDeduplicator
2 | from .document_minhash_deduplicator import DocumentMinhashDeduplicator
3 | from .document_simhash_deduplicator import DocumentSimhashDeduplicator
4 | from .image_deduplicator import ImageDeduplicator
5 | from .ray_basic_deduplicator import RayBasicDeduplicator
6 | from .ray_bts_minhash_deduplicator import RayBTSMinhashDeduplicator
7 | from .ray_document_deduplicator import RayDocumentDeduplicator
8 | from .ray_image_deduplicator import RayImageDeduplicator
9 | from .ray_video_deduplicator import RayVideoDeduplicator
10 | from .video_deduplicator import VideoDeduplicator
11 |
12 | __all__ = [
13 | 'DocumentDeduplicator',
14 | 'DocumentMinhashDeduplicator',
15 | 'DocumentSimhashDeduplicator',
16 | 'ImageDeduplicator',
17 | 'RayBasicDeduplicator',
18 | 'RayDocumentDeduplicator',
19 | 'RayImageDeduplicator',
20 | 'RayVideoDeduplicator',
21 | 'RayImageDeduplicator',
22 | 'RayBTSMinhashDeduplicator',
23 | 'VideoDeduplicator',
24 | ]
25 |
--------------------------------------------------------------------------------
/data_juicer/ops/filter/suffix_filter.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union
2 |
3 | from data_juicer.utils.constant import Fields
4 |
5 | from ..base_op import NON_STATS_FILTERS, OPERATORS, Filter
6 |
7 | OP_NAME = 'suffix_filter'
8 |
9 |
10 | @NON_STATS_FILTERS.register_module(OP_NAME)
11 | @OPERATORS.register_module(OP_NAME)
12 | class SuffixFilter(Filter):
13 | """Filter to keep samples with specified suffix."""
14 |
15 | def __init__(self, suffixes: Union[str, List[str]] = [], *args, **kwargs):
16 | """
17 | Initialization method.
18 |
19 |         :param suffixes: the suffixes of text that will be kept.
20 | For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
21 | :param args: extra args
22 | :param kwargs: extra args
23 | """
24 | super().__init__(*args, **kwargs)
25 | if suffixes is None:
26 | self.suffixes = []
27 | elif isinstance(suffixes, str):
28 | self.suffixes = [suffixes]
29 | else:
30 | self.suffixes = suffixes
31 |
32 | def compute_stats_single(self, sample):
33 | return sample
34 |
35 | def process_single(self, sample):
36 | if self.suffixes:
37 | if sample[Fields.suffix] in self.suffixes:
38 | return True
39 | else:
40 | return False
41 | else:
42 | return True
43 |
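An illustrative sketch of the filter on single samples; the sample layout (a `Fields.suffix` entry alongside the text) follows the constant used above, and the sample contents are made up:

```python
from data_juicer.ops.filter.suffix_filter import SuffixFilter
from data_juicer.utils.constant import Fields

op = SuffixFilter(suffixes=['.py', '.md'])

sample = {'text': 'print("hello")', Fields.suffix: '.py'}
print(op.process_single(sample))  # True: '.py' is in the allow-list

sample[Fields.suffix] = '.exe'
print(op.process_single(sample))  # False: '.exe' is not listed
```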
--------------------------------------------------------------------------------
/data_juicer/ops/grouper/__init__.py:
--------------------------------------------------------------------------------
1 | from .key_value_grouper import KeyValueGrouper
2 | from .naive_grouper import NaiveGrouper
3 | from .naive_reverse_grouper import NaiveReverseGrouper
4 |
5 | __all__ = ['KeyValueGrouper', 'NaiveGrouper', 'NaiveReverseGrouper']
6 |
--------------------------------------------------------------------------------
/data_juicer/ops/grouper/key_value_grouper.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 |
3 | from data_juicer.utils.common_utils import dict_to_hash, nested_access
4 |
5 | from ..base_op import OPERATORS, Grouper, convert_list_dict_to_dict_list
6 | from .naive_grouper import NaiveGrouper
7 |
8 |
9 | @OPERATORS.register_module('key_value_grouper')
10 | class KeyValueGrouper(Grouper):
11 |     """Group samples into batched samples according to the values in given keys. """
12 |
13 | def __init__(self,
14 | group_by_keys: Optional[List[str]] = None,
15 | *args,
16 | **kwargs):
17 | """
18 | Initialization method.
19 |
20 |         :param group_by_keys: group samples according to the values in the keys.
21 |             Nested keys such as "__dj__stats__.text_len" are supported.
22 |             It defaults to [self.text_key].
23 | :param args: extra args
24 | :param kwargs: extra args
25 | """
26 | super().__init__(*args, **kwargs)
27 |
28 | self.group_by_keys = group_by_keys or [self.text_key]
29 | self.naive_grouper = NaiveGrouper()
30 |
31 | def process(self, dataset):
32 |
33 | if len(dataset) == 0:
34 | return dataset
35 |
36 | sample_map = {}
37 | for sample in dataset:
38 | cur_dict = {}
39 | for key in self.group_by_keys:
40 | cur_dict[key] = nested_access(sample, key)
41 | sample_key = dict_to_hash(cur_dict)
42 | if sample_key in sample_map:
43 | sample_map[sample_key].append(sample)
44 | else:
45 | sample_map[sample_key] = [sample]
46 |
47 | batched_samples = [
48 | convert_list_dict_to_dict_list(sample_map[k]) for k in sample_map
49 | ]
50 |
51 | return batched_samples
52 |
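A small sketch of grouping by a top-level key with made-up samples; nested keys such as `"__dj__stats__.text_len"` are resolved the same way through `nested_access`:

```python
from data_juicer.ops.grouper.key_value_grouper import KeyValueGrouper

samples = [
    {'text': 'hello', 'lang': 'en'},
    {'text': 'world', 'lang': 'en'},
    {'text': 'bonjour', 'lang': 'fr'},
]

grouper = KeyValueGrouper(group_by_keys=['lang'])
batched = grouper.process(samples)
# Two batched samples, e.g. {'text': ['hello', 'world'], 'lang': ['en', 'en']}
# and {'text': ['bonjour'], 'lang': ['fr']}.
print(batched)
```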
--------------------------------------------------------------------------------
/data_juicer/ops/grouper/naive_grouper.py:
--------------------------------------------------------------------------------
1 | from ..base_op import OPERATORS, Grouper, convert_list_dict_to_dict_list
2 |
3 |
4 | @OPERATORS.register_module('naive_grouper')
5 | class NaiveGrouper(Grouper):
6 |     """Group all samples into one batched sample. """
7 |
8 | def __init__(self, *args, **kwargs):
9 | """
10 | Initialization method.
11 |
12 | :param args: extra args
13 | :param kwargs: extra args
14 | """
15 | super().__init__(*args, **kwargs)
16 |
17 | def process(self, dataset):
18 |
19 | if len(dataset) == 0:
20 | return dataset
21 |
22 | batched_sample = convert_list_dict_to_dict_list(dataset)
23 |
24 | return [batched_sample]
25 |
--------------------------------------------------------------------------------
/data_juicer/ops/grouper/naive_reverse_grouper.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | from data_juicer.utils.constant import Fields
5 | from data_juicer.utils.file_utils import create_directory_if_not_exists
6 |
7 | from ..base_op import OPERATORS, Grouper, convert_dict_list_to_list_dict
8 |
9 |
10 | @OPERATORS.register_module('naive_reverse_grouper')
11 | class NaiveReverseGrouper(Grouper):
12 |     """Split batched samples back into individual samples. """
13 |
14 | def __init__(self, batch_meta_export_path=None, *args, **kwargs):
15 | """
16 | Initialization method.
17 |
18 | :param batch_meta_export_path: the path to export the batch meta.
19 | Just drop the batch meta if it is None.
20 | :param args: extra args
21 | :param kwargs: extra args
22 | """
23 | super().__init__(*args, **kwargs)
24 | self.batch_meta_export_path = batch_meta_export_path
25 |
26 | def process(self, dataset):
27 |
28 | if len(dataset) == 0:
29 | return dataset
30 |
31 | samples = []
32 | batch_metas = []
33 | for sample in dataset:
34 | if Fields.batch_meta in sample:
35 | batch_metas.append(sample[Fields.batch_meta])
36 | sample = {
37 | k: sample[k]
38 | for k in sample if k != Fields.batch_meta
39 | }
40 | samples.extend(convert_dict_list_to_list_dict(sample))
41 | if self.batch_meta_export_path is not None:
42 | create_directory_if_not_exists(
43 | os.path.dirname(self.batch_meta_export_path))
44 | with open(self.batch_meta_export_path, 'w') as f:
45 | for batch_meta in batch_metas:
46 | f.write(json.dumps(batch_meta, ensure_ascii=False) + '\n')
47 |
48 | return samples
49 |
--------------------------------------------------------------------------------
/data_juicer/ops/load.py:
--------------------------------------------------------------------------------
1 | from .base_op import OPERATORS
2 |
3 |
4 | def load_ops(process_list):
5 | """
6 | Load op list according to the process list from config file.
7 |
8 | :param process_list: A process list. Each item is an op name and its
9 | arguments.
10 | :return: The op instance list.
11 | """
12 | ops = []
13 | new_process_list = []
14 |
15 | for process in process_list:
16 | op_name, args = list(process.items())[0]
17 | ops.append(OPERATORS.modules[op_name](**args))
18 | new_process_list.append(process)
19 |
20 | # store the OP configs into each OP
21 | for op_cfg, op in zip(new_process_list, ops):
22 | op._op_cfg = op_cfg
23 |
24 | return ops
25 |
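A sketch of `load_ops` on a hand-written process list in the same shape the YAML configs produce; the operator names must already be registered in `OPERATORS`:

```python
from data_juicer.ops import load_ops

process_list = [
    {'whitespace_normalization_mapper': {}},
    {'text_length_filter': {'min_len': 10, 'max_len': 10000}},
]

ops = load_ops(process_list)
print([type(op).__name__ for op in ops])
# ['WhitespaceNormalizationMapper', 'TextLengthFilter']
```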
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/annotation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/data_juicer/ops/mapper/annotation/__init__.py
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/calibrate_query_mapper.py:
--------------------------------------------------------------------------------
1 | from data_juicer.ops.base_op import OPERATORS
2 | from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper
3 |
4 | OP_NAME = 'calibrate_query_mapper'
5 |
6 |
7 | # TODO: LLM-based inference.
8 | @OPERATORS.register_module(OP_NAME)
9 | class CalibrateQueryMapper(CalibrateQAMapper):
10 | """
11 | Mapper to calibrate query in question-answer pairs based on reference text.
12 | """
13 |
14 | DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【问题】进行校准,\
15 | 使其更加详细、准确,且仍可以由原答案回答。只输出校准后的问题,不要输出多余内容。'
16 |
17 | def parse_output(self, raw_output):
18 | return raw_output.strip(), None
19 |
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/calibrate_response_mapper.py:
--------------------------------------------------------------------------------
1 | from data_juicer.ops.base_op import OPERATORS
2 | from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper
3 |
4 | OP_NAME = 'calibrate_response_mapper'
5 |
6 |
7 | # TODO: LLM-based inference.
8 | @OPERATORS.register_module(OP_NAME)
9 | class CalibrateResponseMapper(CalibrateQAMapper):
10 | """
11 | Mapper to calibrate response in question-answer pairs based on reference text.
12 | """ # noqa: E501
13 |
14 | DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【回答】进行校准,\
15 | 使其更加详细、准确,且仍可以回答原问题。只输出校准后的回答,不要输出多余内容。'
16 |
17 | def parse_output(self, raw_output):
18 | return None, raw_output.strip()
19 |
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/clean_email_mapper.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import regex as re
4 |
5 | from ..base_op import OPERATORS, Mapper
6 |
7 |
8 | @OPERATORS.register_module('clean_email_mapper')
9 | class CleanEmailMapper(Mapper):
10 | """Mapper to clean email in text samples."""
11 |
12 | _batched_op = True
13 |
14 | def __init__(self,
15 | pattern: Optional[str] = None,
16 | repl: str = '',
17 | *args,
18 | **kwargs):
19 | """
20 | Initialization method.
21 |
22 | :param pattern: regular expression pattern to search for within text.
23 | :param repl: replacement string, default is empty string.
24 | :param args: extra args
25 | :param kwargs: extra args
26 | """
27 | super().__init__(*args, **kwargs)
28 | if pattern is None:
29 | self.pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+'
30 | else:
31 | self.pattern = pattern
32 | if ((len(pattern) > 2) and
33 | (pattern.startswith("r'") and pattern.endswith("'")
34 | or pattern.startswith('r"') and pattern.endswith('"'))):
35 | self.pattern = pattern[2:-1]
36 |
37 | self.repl = repl
38 |
39 | def process_batched(self, samples):
40 | for idx, text in enumerate(samples[self.text_key]):
41 | if not re.search(self.pattern, text, flags=re.DOTALL):
42 | continue
43 | samples[self.text_key][idx] = re.sub(pattern=self.pattern,
44 | repl=self.repl,
45 | string=text,
46 | flags=re.DOTALL)
47 |
48 | return samples
49 |
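An illustrative run of the batched interface; the `'text'` key matches the default `text_key`, and the address is made up:

```python
from data_juicer.ops.mapper.clean_email_mapper import CleanEmailMapper

op = CleanEmailMapper(repl='<EMAIL>')
samples = {'text': ['Contact us at support@example.com for help.']}
print(op.process_batched(samples))
# {'text': ['Contact us at <EMAIL> for help.']}
```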
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/clean_html_mapper.py:
--------------------------------------------------------------------------------
1 | # Some code here has been modified from:
2 | # https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
3 | # --------------------------------------------------------
4 |
5 | from data_juicer.utils.lazy_loader import LazyLoader
6 |
7 | from ..base_op import OPERATORS, Mapper
8 |
9 | selectolax = LazyLoader('selectolax')
10 |
11 | OP_NAME = 'clean_html_mapper'
12 |
13 |
14 | @OPERATORS.register_module(OP_NAME)
15 | class CleanHtmlMapper(Mapper):
16 | """Mapper to clean html code in text samples."""
17 |
18 | _batched_op = True
19 |
20 | def __init__(self, *args, **kwargs):
21 | """
22 | Initialization method.
23 |
24 | :param args: extra args
25 | :param kwargs: extra args
26 | """
27 | super().__init__(*args, **kwargs)
28 |
29 | def process_batched(self, samples):
30 |
31 | def _clean_html(raw_html):
32 |             raw_html = raw_html.replace('<li>', '\n*')
33 |             raw_html = raw_html.replace('</li>', '')
34 |             raw_html = raw_html.replace('<ol>', '\n*')
35 |             raw_html = raw_html.replace('</ol>', '')
36 | parser = selectolax.parser.HTMLParser(raw_html)
37 | return parser.text()
38 |
39 | samples[self.text_key] = [
40 | _clean_html(text) for text in samples[self.text_key]
41 | ]
42 | return samples
43 |
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/fix_unicode_mapper.py:
--------------------------------------------------------------------------------
1 | from data_juicer.utils.lazy_loader import LazyLoader
2 |
3 | from ..base_op import OPERATORS, Mapper
4 |
5 | ftfy = LazyLoader('ftfy')
6 |
7 | OP_NAME = 'fix_unicode_mapper'
8 |
9 |
10 | @OPERATORS.register_module(OP_NAME)
11 | class FixUnicodeMapper(Mapper):
12 | """Mapper to fix unicode errors in text samples."""
13 |
14 | _batched_op = True
15 |
16 | def __init__(self, normalization: str = None, *args, **kwargs):
17 | """
18 | Initialization method.
19 |
20 | :param normalization: the specified form of Unicode
21 | normalization mode, which can be one of
22 |             ['NFC', 'NFKC', 'NFD', 'NFKD'], default 'NFC'.
23 | :param args: extra args
24 | :param kwargs: extra args
25 | """
26 | super().__init__(*args, **kwargs)
27 | if normalization and len(normalization) > 0:
28 | self.normalization = normalization.upper()
29 | else:
30 | self.normalization = 'NFC'
31 |
32 | if self.normalization.upper() not in ['NFC', 'NFKC', 'NFD', 'NFKD']:
33 | raise ValueError(f'Normalization mode [{normalization}] is not '
34 | 'supported. Can only be one of '
35 | '["NFC", "NFKC", "NFD", "NFKD"]')
36 |
37 | def process_batched(self, samples):
38 | samples[self.text_key] = [
39 | ftfy.fix_text(text, normalization=self.normalization)
40 | for text in samples[self.text_key]
41 | ]
42 | return samples
43 |
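A minimal usage sketch (illustrative only); it assumes `ftfy` is installed:

```python
# Hedged usage sketch for FixUnicodeMapper; requires ftfy.
from data_juicer.ops.mapper.fix_unicode_mapper import FixUnicodeMapper

op = FixUnicodeMapper(normalization='NFKC')
samples = {'text': ['The Mona Lisa doesnÃ¢â‚¬â„¢t have eyebrows.']}
print(op.process_batched(samples))
# ftfy repairs the mojibake so the text reads "doesn't"
```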
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/optimize_query_mapper.py:
--------------------------------------------------------------------------------
1 | from data_juicer.ops.base_op import OPERATORS
2 | from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
3 |
4 | OP_NAME = 'optimize_query_mapper'
5 |
6 |
7 | # TODO: Extend LLM-based OPs into API-based implementation.
8 | @OPERATORS.register_module(OP_NAME)
9 | class OptimizeQueryMapper(OptimizeQAMapper):
10 | """
11 | Mapper to optimize query in question-answer pairs.
12 | """
13 |
14 | DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。' # noqa: E501
15 |
16 | _accelerator = 'cuda'
17 |
18 | def parse_output(self, raw_output):
19 | return raw_output.strip(), None
20 |
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/optimize_response_mapper.py:
--------------------------------------------------------------------------------
1 | from data_juicer.ops.base_op import OPERATORS
2 | from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
3 |
4 | OP_NAME = 'optimize_response_mapper'
5 |
6 |
7 | # TODO: Extend LLM-based OPs into API-based implementation.
8 | @OPERATORS.register_module(OP_NAME)
9 | class OptimizeResponseMapper(OptimizeQAMapper):
10 | """
11 | Mapper to optimize response in question-answer pairs.
12 | """
13 |
14 | DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。'
15 |
16 | _accelerator = 'cuda'
17 |
18 | def parse_output(self, raw_output):
19 | return None, raw_output.strip()
20 |
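Both of these mappers only override `parse_output` of `OptimizeQAMapper`, returning a `(query, response)` tuple in which `None` means "leave that side of the pair unchanged". Running the full operators requires a (typically CUDA-backed) LLM, so the sketch below only illustrates the parsing contract:

```python
# Illustration of the parse_output contract only; no model is loaded.
from data_juicer.ops.mapper.optimize_query_mapper import OptimizeQueryMapper
from data_juicer.ops.mapper.optimize_response_mapper import OptimizeResponseMapper

raw = '  What is MinHash deduplication?  '
# Passing None as self is only for illustration; parse_output uses no instance state.
print(OptimizeQueryMapper.parse_output(None, raw))     # ('What is MinHash deduplication?', None)
print(OptimizeResponseMapper.parse_output(None, raw))  # (None, 'What is MinHash deduplication?')
```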
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/remove_bibliography_mapper.py:
--------------------------------------------------------------------------------
1 | # Some code here has been modified from:
2 | # https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
3 | # --------------------------------------------------------
4 |
5 | import regex as re
6 |
7 | from ..base_op import OPERATORS, Mapper
8 |
9 |
10 | @OPERATORS.register_module('remove_bibliography_mapper')
11 | class RemoveBibliographyMapper(Mapper):
12 | """Mapper to remove bibliography at the end of documents in Latex
13 | samples."""
14 |
15 | _batched_op = True
16 |
17 | def __init__(self, *args, **kwargs):
18 | """
19 | Initialization method.
20 |
21 | :param args: extra args
22 | :param kwargs: extra args
23 | """
24 | super().__init__(*args, **kwargs)
25 | self.pattern = r'(\\appendix|'
26 | self.pattern += r'\\begin\{references\}|'
27 | self.pattern += r'\\begin\{REFERENCES\}|'
28 | self.pattern += r'\\begin\{thebibliography\}|'
29 | self.pattern += r'\\bibliography\{.*\}'
30 | self.pattern += r').*$'
31 |
32 | def process_batched(self, samples):
33 | samples[self.text_key] = [
34 | re.sub(pattern=self.pattern,
35 | repl=r'',
36 | string=text,
37 | flags=re.DOTALL) for text in samples[self.text_key]
38 | ]
39 |
40 | return samples
41 |
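A minimal usage sketch (illustrative only) on a LaTeX-like sample:

```python
# Hedged usage sketch for RemoveBibliographyMapper.
from data_juicer.ops.mapper.remove_bibliography_mapper import RemoveBibliographyMapper

op = RemoveBibliographyMapper()
tex = ('Main body of the paper.\n'
       '\\begin{thebibliography}{9}\n'
       '\\bibitem{smith} Smith, 2020.\n'
       '\\end{thebibliography}\n')
samples = {'text': [tex]}
print(op.process_batched(samples))
# expected: {'text': ['Main body of the paper.\n']}
```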
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/remove_non_chinese_character_mapper.py:
--------------------------------------------------------------------------------
1 | import regex as re
2 |
3 | from ..base_op import OPERATORS, Mapper
4 |
5 |
6 | @OPERATORS.register_module('remove_non_chinese_character_mapper')
7 | class RemoveNonChineseCharacterlMapper(Mapper):
8 | """Mapper to remove non chinese Character in text samples."""
9 |
10 | _batched_op = True
11 |
12 | def __init__(self,
13 | keep_alphabet: bool = True,
14 | keep_number: bool = True,
15 | keep_punc: bool = True,
16 | *args,
17 | **kwargs):
18 | """
19 | Initialization method.
20 |
21 | :param keep_alphabet: whether to keep alphabet
22 | :param keep_number: whether to keep number
23 | :param keep_punc: whether to keep punctuation
24 | :param args: extra args
25 | :param kwargs: extra args
26 | """
27 | super().__init__(*args, **kwargs)
28 | self.pattern = u'[^\u4e00-\u9fa5'
29 | if keep_alphabet:
30 | self.pattern += u'A-Za-z'
31 | if keep_number:
32 | self.pattern += u'0-9'
33 | if keep_punc:
34 | self.pattern += u'., ,\\-。%《*》/•、&&(—)(+):?!!“”·]+'
35 | else:
36 | self.pattern += u']'
37 |
38 | def process_batched(self, samples):
39 | for idx, text in enumerate(samples[self.text_key]):
40 | if not re.search(self.pattern, text, flags=re.DOTALL):
41 | continue
42 |
43 | samples[self.text_key][idx] = re.sub(pattern=self.pattern,
44 | repl=r'',
45 | string=text,
46 | flags=re.DOTALL)
47 | return samples
48 |
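A minimal usage sketch (illustrative only); the class name spelling follows the source above:

```python
# Hedged usage sketch for RemoveNonChineseCharacterlMapper.
from data_juicer.ops.mapper.remove_non_chinese_character_mapper import \
    RemoveNonChineseCharacterlMapper

op = RemoveNonChineseCharacterlMapper(keep_alphabet=False,
                                      keep_number=True,
                                      keep_punc=False)
samples = {'text': ['2024年,Data-Juicer 发布了新版本!']}
print(op.process_batched(samples))
# expected: {'text': ['2024年发布了新版本']}
```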
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/remove_specific_chars_mapper.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union
2 |
3 | import regex as re
4 |
5 | from ..base_op import OPERATORS, Mapper
6 |
7 |
8 | @OPERATORS.register_module('remove_specific_chars_mapper')
9 | class RemoveSpecificCharsMapper(Mapper):
10 | """Mapper to clean specific chars in text samples."""
11 |
12 | _batched_op = True
13 |
14 | def __init__(self,
15 | chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□',
16 | *args,
17 | **kwargs):
18 | """
19 | Initialization method.
20 |
21 | :param chars_to_remove: a list or a string including all
22 | characters that need to be removed from text.
23 | :param args: extra args
24 | :param kwargs: extra args
25 | """
26 |
27 | super().__init__(*args, **kwargs)
28 | if chars_to_remove:
29 | self.pattern = '[' + '|'.join(chars_to_remove) + ']'
30 | else:
31 | self.pattern = None
32 |
33 | def process_batched(self, samples):
34 | if self.pattern is None:
35 | return samples
36 |
37 | samples[self.text_key] = [
38 | re.sub(pattern=self.pattern,
39 | repl=r'',
40 | string=text,
41 | flags=re.DOTALL) for text in samples[self.text_key]
42 | ]
43 | return samples
44 |
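A minimal usage sketch (illustrative only):

```python
# Hedged usage sketch for RemoveSpecificCharsMapper.
from data_juicer.ops.mapper.remove_specific_chars_mapper import RemoveSpecificCharsMapper

op = RemoveSpecificCharsMapper(chars_to_remove='◆■')
samples = {'text': ['◆Features ■Fast ■Flexible']}
print(op.process_batched(samples))
# expected: {'text': ['Features Fast Flexible']}
```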
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/remove_table_text_mapper.py:
--------------------------------------------------------------------------------
1 | import regex as re
2 | from pydantic import Field
3 | from typing_extensions import Annotated
4 |
5 | from ..base_op import OPERATORS, Mapper
6 |
7 |
8 | @OPERATORS.register_module('remove_table_text_mapper')
9 | class RemoveTableTextMapper(Mapper):
10 | """
11 | Mapper to remove table texts from text samples.
12 |
13 | A regular expression is used to remove tables whose column
14 | counts fall within the specified range.
15 | """
16 |
17 | _batched_op = True
18 |
19 | def __init__(self,
20 | min_col: Annotated[int, Field(ge=2, le=20)] = 2,
21 | max_col: Annotated[int, Field(ge=2, le=20)] = 20,
22 | *args,
23 | **kwargs):
24 | """
25 | Initialization method.
26 |
27 | :param min_col: The min number of columns of table to remove.
28 | :param max_col: The max number of columns of table to remove.
29 | :param args: extra args
30 | :param kwargs: extra args
31 | """
32 | super().__init__(*args, **kwargs)
33 | self.min_col = min_col
34 | self.max_col = max_col
35 | self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}'
36 |
37 | def process_batched(self, samples):
38 | for idx, text in enumerate(samples[self.text_key]):
39 | for i in range(self.min_col - 1, self.max_col):
40 | pattern = re.compile(self.pattern % i)
41 | text = pattern.sub('', text)
42 |
43 | samples[self.text_key][idx] = text
44 |
45 | return samples
46 |
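A minimal usage sketch (illustrative only) on a whitespace-separated three-column table:

```python
# Hedged usage sketch for RemoveTableTextMapper.
from data_juicer.ops.mapper.remove_table_text_mapper import RemoveTableTextMapper

op = RemoveTableTextMapper(min_col=2, max_col=20)
text = ('Some introduction.\n'
        'name size type\n'
        'a.txt 10 file\n'
        'b.txt 20 file\n'
        'Some conclusion.')
samples = {'text': [text]}
print(op.process_batched(samples))
# expected: {'text': ['Some introduction.\nSome conclusion.']}
```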
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/sentence_split_mapper.py:
--------------------------------------------------------------------------------
1 | from data_juicer.utils.model_utils import get_model, prepare_model
2 | from data_juicer.utils.nltk_utils import patch_nltk_pickle_security
3 |
4 | from ..base_op import OPERATORS, Mapper
5 | from ..common import get_sentences_from_document
6 |
7 | OP_NAME = 'sentence_split_mapper'
8 |
9 |
10 | @OPERATORS.register_module(OP_NAME)
11 | class SentenceSplitMapper(Mapper):
12 | """Mapper to split text samples to sentences."""
13 |
14 | _batched_op = True
15 |
16 | def __init__(self, lang: str = 'en', *args, **kwargs):
17 | """
18 | Initialization method.
19 |
20 | :param lang: language of the text, used to choose the sentence tokenizer.
21 | :param args: extra args
22 | :param kwargs: extra args
23 | """
24 | super().__init__(*args, **kwargs)
25 | self.lang = lang
26 |
27 | # Ensure NLTK pickle security patch is applied
28 | patch_nltk_pickle_security()
29 |
30 | # Prepare the sentence tokenizer model
31 | self.model_key = prepare_model(model_type='nltk', lang=lang)
32 |
33 | def process_batched(self, samples):
34 | # Get the sentence tokenizer model
35 | nltk_model = get_model(self.model_key)
36 |
37 | samples[self.text_key] = [
38 | get_sentences_from_document(
39 | text, model_func=nltk_model.tokenize if nltk_model else None)
40 | for text in samples[self.text_key]
41 | ]
42 |
43 | return samples
44 |
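A minimal usage sketch (illustrative only); the NLTK sentence tokenizer is downloaded on first use, so network access may be needed:

```python
# Hedged usage sketch for SentenceSplitMapper.
from data_juicer.ops.mapper.sentence_split_mapper import SentenceSplitMapper

op = SentenceSplitMapper(lang='en')
samples = {'text': ['Data-Juicer is a data processing system. It provides many operators.']}
print(op.process_batched(samples))
# roughly: sentences joined by newlines, e.g.
# {'text': ['Data-Juicer is a data processing system.\nIt provides many operators.']}
```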
--------------------------------------------------------------------------------
/data_juicer/ops/mapper/whitespace_normalization_mapper.py:
--------------------------------------------------------------------------------
1 | # Most of the code here has been modified from:
2 | # https://github.com/bigscience-workshop/data-preparation
3 | # --------------------------------------------------------
4 |
5 | from ..base_op import OPERATORS, Mapper
6 | from ..common.special_characters import VARIOUS_WHITESPACES
7 |
8 |
9 | @OPERATORS.register_module('whitespace_normalization_mapper')
10 | class WhitespaceNormalizationMapper(Mapper):
11 | """
12 | Mapper to normalize different kinds of whitespaces to whitespace ' ' (0x20)
13 | in text samples.
14 |
15 | Different kinds of whitespaces can be found here:
16 | https://en.wikipedia.org/wiki/Whitespace_character
17 | """
18 |
19 | _batched_op = True
20 |
21 | def __init__(self, *args, **kwargs):
22 | """
23 | Initialization method.
24 |
25 | :param args: extra args
26 | :param kwargs: extra args
27 | """
28 | super().__init__(*args, **kwargs)
29 |
30 | def process_batched(self, samples):
31 | for idx, text in enumerate(samples[self.text_key]):
32 | # remove whitespaces before and after the main content
33 | text = text.strip()
34 |
35 | # replace all kinds of whitespaces with ' '
36 | samples[self.text_key][idx] = ''.join([
37 | char if char not in VARIOUS_WHITESPACES else ' '
38 | for char in text
39 | ])
40 |
41 | return samples
42 |
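A minimal usage sketch (illustrative only); whether a given character is converted depends on its membership in VARIOUS_WHITESPACES:

```python
# Hedged usage sketch for WhitespaceNormalizationMapper.
from data_juicer.ops.mapper.whitespace_normalization_mapper import \
    WhitespaceNormalizationMapper

op = WhitespaceNormalizationMapper()
samples = {'text': ['\u00a0hello\u2009world\u3000!']}
print(op.process_batched(samples))
# expected: {'text': ['hello world !']}, assuming U+2009 and U+3000 are in VARIOUS_WHITESPACES
```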
--------------------------------------------------------------------------------
/data_juicer/ops/selector/__init__.py:
--------------------------------------------------------------------------------
1 | from .frequency_specified_field_selector import FrequencySpecifiedFieldSelector
2 | from .random_selector import RandomSelector
3 | from .range_specified_field_selector import RangeSpecifiedFieldSelector
4 | from .tags_specified_field_selector import TagsSpecifiedFieldSelector
5 | from .topk_specified_field_selector import TopkSpecifiedFieldSelector
6 |
7 | __all__ = [
8 | 'FrequencySpecifiedFieldSelector', 'RandomSelector',
9 | 'RangeSpecifiedFieldSelector', 'TagsSpecifiedFieldSelector',
10 | 'TopkSpecifiedFieldSelector'
11 | ]
12 |
--------------------------------------------------------------------------------
/data_juicer/tools/__init__.py:
--------------------------------------------------------------------------------
1 | # Some code here has been modified from:
2 | # https://github.com/Megvii-BaseDetection/YOLOX
3 | # --------------------------------------------------------
4 |
5 | # This file is used for package installation. Script of tools will be
6 | # available.
7 |
8 | import sys
9 | from importlib import abc, util
10 | from pathlib import Path
11 |
12 | _TOOLS_PATH = Path(__file__).resolve().parent.parent.parent / 'tools'
13 |
14 | if _TOOLS_PATH.is_dir():
15 | # This is true only for in-place installation
16 | # (pip install -e, setup.py develop),
17 | # where setup(package_dir=) does not work:
18 | # https://github.com/pypa/setuptools/issues/230
19 |
20 | class _PathFinder(abc.MetaPathFinder):
21 |
22 | def find_spec(self, name, path, target=None):
23 | if not name.startswith('data_juicer.tools.'):
24 | return
25 | project_name = name.split('.')[-1] + '.py'
26 | target_file = _TOOLS_PATH / project_name
27 | if not target_file.is_file():
28 | return
29 | return util.spec_from_file_location(name, target_file)
30 |
31 | sys.meta_path.append(_PathFinder())
32 |
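With an editable install (`pip install -e .`), the `_PathFinder` above lets scripts in the repository-level `tools/` directory be imported as `data_juicer.tools.<script>`. A small sketch (importing such a module may pull in the full data_juicer runtime):

```python
# Hedged sketch: resolve a repository tool script through the meta path finder above.
import importlib

mod = importlib.import_module('data_juicer.tools.process_data')
print(mod.__file__)  # points into the repository's top-level tools/ directory
```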
--------------------------------------------------------------------------------
/data_juicer/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/data_juicer/utils/__init__.py
--------------------------------------------------------------------------------
/data_juicer/utils/resource_utils.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 |
3 | import psutil
4 | from loguru import logger
5 |
6 | NVSMI_REPORT = True
7 |
8 |
9 | def query_cuda_info(query_key):
10 | global NVSMI_REPORT
11 | # query CUDA info (in MB) using the "nvidia-smi" command
12 | try:
13 | nvidia_smi_output = subprocess.check_output([
14 | 'nvidia-smi', f'--query-gpu={query_key}',
15 | '--format=csv,noheader,nounits'
16 | ]).decode('utf-8')
17 | except Exception as e:
18 | if 'non-zero exit status 2' in str(e):
19 | err_msg = f'The specified query_key [{query_key}] might not be ' \
20 | f'supported by command nvidia-smi. Please check and ' \
21 | f'retry!'
22 | elif 'No such file or directory' in str(e):
23 | err_msg = 'Command nvidia-smi is not found. There might be no ' \
24 | 'GPUs on this machine.'
25 | else:
26 | err_msg = str(e)
27 | if NVSMI_REPORT:
28 | logger.warning(err_msg)
29 | NVSMI_REPORT = False
30 | return None
31 | cuda_info_list = []
32 | for line in nvidia_smi_output.strip().split('\n'):
33 | cuda_info_list.append(int(line))
34 | return cuda_info_list
35 |
36 |
37 | def get_cpu_count():
38 | return psutil.cpu_count()
39 |
40 |
41 | def get_cpu_utilization():
42 | return psutil.cpu_percent()
43 |
44 |
45 | def query_mem_info(query_key):
46 | mem = psutil.virtual_memory()
47 | if query_key not in mem._fields:
48 | logger.warning(f'No such query key [{query_key}] for memory info. '
49 | f'Should be one of {mem._fields}')
50 | return None
51 | val = round(mem.__getattribute__(query_key) / (2**20), 2) # in MB
52 | return val
53 |
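A minimal usage sketch for these helpers (illustrative only); the CUDA query returns None on machines without `nvidia-smi`:

```python
# Hedged usage sketch for the resource query helpers.
from data_juicer.utils.resource_utils import (get_cpu_count,
                                              get_cpu_utilization,
                                              query_cuda_info,
                                              query_mem_info)

print(get_cpu_count())                 # logical CPU count
print(get_cpu_utilization())           # CPU utilization in percent
print(query_mem_info('available'))     # available memory in MB
print(query_cuda_info('memory.used'))  # per-GPU used memory in MB, or None without GPUs
```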
--------------------------------------------------------------------------------
/data_juicer/utils/sample.py:
--------------------------------------------------------------------------------
1 | from itertools import chain, repeat
2 |
3 | import numpy as np
4 |
5 |
6 | def random_sample(dataset, weight=1.0, sample_number=0, seed=None):
7 | """
8 | Randomly sample a subset from a dataset by weight or by number.
9 | If the sample number is greater than 0, it takes precedence
10 | over the weight.
11 | :param dataset: a HuggingFace dataset
12 | :param weight: sample ratio of dataset
13 | :param sample_number: sample number of dataset
14 | :param seed: random sample seed, if None, 42 as default
15 | :return: a subset of dataset
16 | """
17 | if seed is None:
18 | seed = 42
19 |
20 | ds_samples = dataset.num_rows
21 | if sample_number <= 0:
22 | sample_number = int(np.ceil(ds_samples * weight))
23 |
24 | if sample_number == ds_samples:
25 | return dataset
26 |
27 | sample_index = range(sample_number)
28 |
29 | n_repeat = int(np.ceil(sample_number / ds_samples)) - 1
30 | if n_repeat > 0:
31 | remain_samples = sample_number - n_repeat * ds_samples
32 | sample_index = chain(*repeat(range(ds_samples), n_repeat),
33 | range(remain_samples))
34 |
35 | return dataset.shuffle(seed=seed).select(sample_index)
36 |
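A minimal usage sketch (illustrative only) with a small in-memory HuggingFace dataset; it assumes the `datasets` package is available:

```python
# Hedged usage sketch for random_sample.
from datasets import Dataset

from data_juicer.utils.sample import random_sample

ds = Dataset.from_dict({'text': [f'sample {i}' for i in range(10)]})
print(random_sample(ds, weight=0.3).num_rows)        # 3 rows (ceil(10 * 0.3))
print(random_sample(ds, sample_number=15).num_rows)  # 15 rows, repeating some samples
```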
--------------------------------------------------------------------------------
/demos/README_ZH.md:
--------------------------------------------------------------------------------
1 | # 演示
2 |
3 | 此文件夹包含一些演示样例,帮助用户轻松体验 Data-Juicer 的各种功能和工具。
4 |
5 | ## 用法
6 |
7 | 使用 `demos` 子目录下的 `app.py` 来执行演示样例。
8 |
9 | ```shell
10 | cd <subdir_of_demos>
11 | streamlit run app.py
12 | ```
13 |
14 | ## 可用的演示
15 |
16 | - 数据集样例 (`data`)
17 | - 该文件夹包含一些样例数据集。
18 |
19 | - 初探索 (`overview_scan`)
20 | - 该示例介绍了 Data-Juicer 的基本概念和功能,例如特性、配置系统,算子等等。
21 |
22 | - 数据处理回路 (`data_process_loop`)
23 | - 该示例用来分析和处理数据集,并给出处理前后数据集的统计信息比对。
24 |
25 | - 词法多样性可视化 (`data_visualization_diversity`)
26 | - 该示例可以用来分析 CFT 数据集的动词-名词结构,并绘制成sunburst层级环形图表。
27 |
28 | - 算子效果可视化 (`data_visualization_op_effect`)
29 | - 该示例可以分析数据集的统计信息,并根据这些统计信息可以显示出每个 `Filter` 算子在不同阈值下的效果。
30 |
31 | - 统计信息可视化 (`data_visualization_statistics`)
32 | - 该示例可以分析数据集,并获得多达13种统计信息。
33 |
34 | - 处理 CFT 中文数据 (`process_cft_zh_data`)
35 | - 以 Alpaca-CoT 的部分中文数据为例,演示了 LLM 中指令跟随微调数据和有监督微调数据的分析和处理流程。
36 |
37 | - 处理预训练科学文献类数据 (`process_sci_data`)
38 | - 以 arXiv 的部分数据为例,演示了如何处理 LLM 预训练中的科学文献类数据的分析和处理流程。
39 |
40 | - 处理预训练代码类数据 (`process_code_data`)
41 | - 以 Stack-Exchange 的部分数据为例,演示了如何处理 LLM 预训练中的代码类数据的分析和处理流程。
42 |
43 | - 文本质量打分器 (`tool_quality_classifier`)
44 | - 该示例提供了3种文本质量打分器,对数据集进行打分评估。
45 |
46 | - 按语言分割数据集 (`tool_dataset_splitting_by_language`)
47 | - 该示例按照语言将数据集拆分为不同的子数据集。
48 |
49 | - 数据混合 (`data_mixture`)
50 | - 该示例从多份数据集中进行采样并混合为一个新的数据集。
51 |
--------------------------------------------------------------------------------
/demos/api_service/configs/dj_config_template.yaml:
--------------------------------------------------------------------------------
1 | # data-juicer config template
2 |
3 | # global parameters
4 | project_name: 'dj_agent'
5 | dataset_path: '' # path to your dataset directory or file, specified in the agent
6 | np: 4 # number of subprocess to process your dataset
7 |
8 | export_path: '' # path to the output path, specified in the agent
9 | export_original_dataset: true
10 |
11 | # process schedule
12 | # a list of several process operators with their arguments, specified in the agent
13 | process: []
14 |
--------------------------------------------------------------------------------
/demos/api_service/configs/model_configs.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "config_name": "gpt-4",
4 | "model_type": "openai-chat",
5 | "model_name": "gpt-4",
6 | "api_key": "your API key",
7 | "organization": "your organization name",
8 | "generate_args": {
9 | "temperature": 0.5
10 | }
11 | },
12 | {
13 | "config_name": "dashscope_chat-qwen-max",
14 | "model_type": "dashscope_chat",
15 | "model_name": "qwen-max",
16 | "api_key": "your API key",
17 | "generate_args": {
18 | "temperature": 0.0
19 | }
20 | }
21 | ]
22 |
--------------------------------------------------------------------------------
/demos/auto_evaluation_helm/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import streamlit as st
5 |
6 |
7 | class Visualize:
8 |
9 | @staticmethod
10 | def setup():
11 | st.set_page_config(
12 | page_title='Data-Juicer',
13 | page_icon=':smile',
14 | layout='wide',
15 | # initial_sidebar_state="expanded",
16 | )
17 |
18 | readme_link = 'https://github.com/alibaba/data-juicer'
19 | st.markdown(
20 | '# Data-Juicer',
21 | unsafe_allow_html=True,
22 | )
23 | st.markdown(
24 | f'<div align="center">A One-Stop Data Processing System for \
25 | Large Language Models, \
26 | see more details in our <a href={readme_link}>Github</a></div>',
27 | unsafe_allow_html=True,
28 | )
29 |
30 | @staticmethod
31 | def visualize():
32 | Visualize.setup()
33 |
34 |
35 | def main():
36 |
37 | def make_image(line):
38 | pattern = r'!\[(.*?)\]\((.*?)\)'
39 | matches = re.findall(pattern, line)
40 | st.image(matches[0][1], output_format='png', use_column_width=True)
41 |
42 | Visualize.visualize()
43 | buffer = []
44 | with open('README_ZH.md', 'r', encoding='utf-8') as f:
45 | lines = f.readlines()
46 | for line in lines:
47 | if 'imgs/' in line:
48 | st.markdown('\n'.join(buffer))
49 | make_image(line)
50 | buffer.clear()
51 | else:
52 | buffer.append(line)
53 | st.markdown('\n'.join(buffer))
54 | # hello()
55 |
56 |
57 | if __name__ == '__main__':
58 | main()
59 |
--------------------------------------------------------------------------------
/demos/auto_evaluation_helm/imgs/data-juicer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/auto_evaluation_helm/imgs/data-juicer.png
--------------------------------------------------------------------------------
/demos/auto_evaluation_helm/imgs/eval-01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/auto_evaluation_helm/imgs/eval-01.png
--------------------------------------------------------------------------------
/demos/auto_evaluation_helm/imgs/eval-02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/auto_evaluation_helm/imgs/eval-02.png
--------------------------------------------------------------------------------
/demos/data/demo-dataset-annotation-human-preference.jsonl:
--------------------------------------------------------------------------------
1 | {"prompt": "What is the capital of France?", "answer1": "Paris", "answer2": "Lyon"}
2 | {"prompt": "Which planet is known as the Red Planet?", "answer1": "Mars", "answer2": "Venus"}
3 | {"prompt": "What is the chemical symbol for gold?", "answer1": "Au", "answer2": "Ag"}
4 | {"prompt": "Who wrote 'Romeo and Juliet'?", "answer1": "William Shakespeare", "answer2": "Christopher Marlowe"}
5 | {"prompt": "What is the largest mammal on Earth?", "answer1": "Blue Whale", "answer2": "African Elephant"}
6 | {"prompt": "In which year did World War II end?", "answer1": "1945", "answer2": "1944"}
7 | {"prompt": "What is the square root of 64?", "answer1": "8", "answer2": "6"}
8 | {"prompt": "Who painted the Mona Lisa?", "answer1": "Leonardo da Vinci", "answer2": "Michelangelo"}
9 | {"prompt": "What is the main component of the Sun?", "answer1": "Hydrogen", "answer2": "Helium"}
10 | {"prompt": "Which programming language was created by Guido van Rossum?", "answer1": "Python", "answer2": "Java"}
--------------------------------------------------------------------------------
/demos/data/demo-dataset-chatml.jsonl:
--------------------------------------------------------------------------------
1 | {"messages": [{"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "谁在文艺复兴时期绘制人体?"}, {"role": "assistant", "content": "文艺复兴时期是一个关于艺术、文化和学术的复兴运动,在这个时期,许多艺术家都绘制了人体。"},{"role": "user", "content": "那雕塑方面如何呢?"}, {"role": "assistant", "content": "文艺复兴时期的雕塑也非常有名,几位世界级的雕塑大师都出自于这个时期。"}]}
2 | {"messages":[{"content":"You are a helpful assistant","role":"system"},{"content":"什么时期的音乐家开始广泛使用交响乐团?","role":"user"},{"content":"浪漫主义时期,音乐家们开始广泛使用和扩展交响乐团,创作出规模宏大、情感丰富的交响乐作品。","role":"assistant"}]}
3 | {"messages":[{"content":"You are a helpful assistant","role":"system"},{"content":"哪个物理定律描述了物体在不受外力作用时保持静止或匀速直线运动的状态?","role":"user"},{"content":"牛顿第一定律,也称为惯性定律,描述了物体在不受外力作用时保持静止状态或匀速直线运动的状态。","role":"assistant"}]}
4 | {"messages":[{"content":"You are a helpful assistant","role":"system"},{"content":"哪种文学流派强调通过象征和暗喻探索潜意识思维?","role":"user"},{"content":"现代主义文学流派强调通过象征、暗喻以及非线性叙述等手法,深入探索人物的内心世界与潜意识思维。","role":"assistant"}]}
--------------------------------------------------------------------------------
/demos/data/demo-dataset-content.jsonl:
--------------------------------------------------------------------------------
1 | {"content": "Today is Sunday and it's a happy day!", "src": "Arxiv", "date": "2023-04-27", "version": "1.0"}
2 | {"content": "Do you need a cup of coffee?", "src": "code", "author": "xxx"}
3 | {"content": "你好,请问你是谁", "src": "customized", "author": "xxx"}
4 | {"content": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "src": "Oscar", "version": "2.0"}
5 | {"content": "欢迎来到阿里巴巴!", "src": "customized", "version": "0.1", "author": "xxx"}
6 | {"content": "This paper proposed a novel method on LLM pretraining.", "src": "customized", "author": "xxx"}
7 |
--------------------------------------------------------------------------------
/demos/data/demo-dataset-images.jsonl:
--------------------------------------------------------------------------------
1 | {"images":["../../tests/ops/data/img1.png"], "text": "<__dj__image> A comfortable bed."}
2 | {"images":["../../tests/ops/data/img2.jpg"], "text": "<__dj__image> A bus."}
3 | {"images":["../../tests/ops/data/img3.jpg"], "text": "<__dj__image> Black and white photograph of a woman holding an umbrella."}
4 |
--------------------------------------------------------------------------------
/demos/data/demo-dataset-videos.jsonl:
--------------------------------------------------------------------------------
1 | {"videos":["../../tests/ops/data/video1.mp4"], "text": "<__dj__video> a cartoon"}
2 | {"videos":["../../tests/ops/data/video1.mp4"], "text": "<__dj__video> 一段卡通"}
3 | {"videos":["../../tests/ops/data/video2.mp4"], "text": "<__dj__video> a man"}
4 | {"videos":["../../tests/ops/data/video2.mp4"], "text": "<__dj__video> 一个男人"}
5 | {"videos":["../../tests/ops/data/video3.mp4"], "text": "<__dj__video> two women"}
6 | {"videos":["../../tests/ops/data/video3.mp4"], "text": "<__dj__video> 两个女人"}
7 |
--------------------------------------------------------------------------------
/demos/data/demo-dataset.jsonl:
--------------------------------------------------------------------------------
1 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
2 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
3 | {"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}}
4 | {"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}}
5 | {"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}}
6 | {"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}}
7 |
--------------------------------------------------------------------------------
/demos/data/demo-dataset_1725870268.jsonl:
--------------------------------------------------------------------------------
1 | {"text":"Today is Sunday and it's a happy day!","meta":{"src":"Arxiv","date":1682553600000,"version":"1.0","author":null},"__dj__stats__":{"text_len":37}}
2 | {"text":"Do you need a cup of coffee?","meta":{"src":"code","date":null,"version":null,"author":"xxx"},"__dj__stats__":{"text_len":28}}
3 | {"text":"Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.","meta":{"src":"Oscar","date":null,"version":"2.0","author":null},"__dj__stats__":{"text_len":101}}
4 | {"text":"This paper proposed a novel method on LLM pretraining.","meta":{"src":"customized","date":null,"version":null,"author":"xxx"},"__dj__stats__":{"text_len":54}}
5 |
--------------------------------------------------------------------------------
/demos/data/demo-dataset_1725870628.jsonl:
--------------------------------------------------------------------------------
1 | {"text":"Today is Sunday and it's a happy day!","meta":{"src":"Arxiv","date":1682553600000,"version":"1.0","author":null},"__dj__stats__":{"text_len":37}}
2 | {"text":"Do you need a cup of coffee?","meta":{"src":"code","date":null,"version":null,"author":"xxx"},"__dj__stats__":{"text_len":28}}
3 | {"text":"This paper proposed a novel method on LLM pretraining.","meta":{"src":"customized","date":null,"version":null,"author":"xxx"},"__dj__stats__":{"text_len":54}}
4 |
--------------------------------------------------------------------------------
/demos/data_process_hpo/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 |
4 | class Visualize:
5 |
6 | @staticmethod
7 | def setup():
8 | st.set_page_config(
9 | page_title='Data-Juicer',
10 | page_icon=':smile',
11 | #layout='wide',
12 | # initial_sidebar_state="expanded",
13 | )
14 |
15 | readme_link = 'https://github.com/alibaba/data-juicer'
16 | st.markdown(
17 | '<div align="center"><font size="70"> Data-Juicer \
18 | </font></div>',
19 | unsafe_allow_html=True,
20 | )
21 | st.markdown(
22 | f'<div align="center">A One-Stop Data Processing System for \
23 | Large Language Models, \
24 | see more details in our <a href={readme_link}>Github</a></div>',
25 | unsafe_allow_html=True,
26 | )
27 |
28 | @staticmethod
29 | def visualize():
30 | Visualize.setup()
31 |
32 |
33 | def main():
34 |
35 | def hello():
36 |
37 | st.image('imgs/data-juicer.png',
38 | output_format='png',
39 | use_column_width=True)
40 | demo = 'The demo is coming soon😊'
41 | st.markdown(
42 | f'<div align="center"><font size="70"> {demo} \
43 | </font></div>',
44 | unsafe_allow_html=True,
45 | )
46 |
47 | Visualize.visualize()
48 | hello()
49 |
50 |
51 | if __name__ == '__main__':
52 | main()
53 |
--------------------------------------------------------------------------------
/demos/data_process_hpo/imgs/data-juicer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/data_process_hpo/imgs/data-juicer.png
--------------------------------------------------------------------------------
/demos/data_visualization_op_effect/data/demo-dataset.jsonl:
--------------------------------------------------------------------------------
1 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
2 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
3 | {"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}}
4 | {"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}}
5 | {"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}}
6 | {"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}}
7 |
--------------------------------------------------------------------------------
/demos/data_visualization_op_insight/cache/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/data_visualization_op_insight/cache/.gitkeep
--------------------------------------------------------------------------------
/demos/data_visualization_statistics/data/demo-dataset.jsonl:
--------------------------------------------------------------------------------
1 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
2 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
3 | {"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}}
4 | {"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}}
5 | {"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}}
6 | {"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}}
7 |
--------------------------------------------------------------------------------
/demos/process_on_ray/configs/dedup.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for dataset
2 |
3 | # global parameters
4 | project_name: 'demo-dedup'
5 | dataset_path: './demos/process_on_ray/data/'
6 | export_path: './outputs/demo-dedup/demo-ray-bts-dedup-processed'
7 |
8 | executor_type: 'ray'
9 | ray_address: 'auto'
10 |
11 | # process schedule
12 | # a list of several process operators with their arguments
13 | process:
14 | - ray_bts_minhash_deduplicator:
15 | tokenization: 'character'
--------------------------------------------------------------------------------
/demos/process_video_on_ray/data/Note.md:
--------------------------------------------------------------------------------
1 | # Note for dataset path
2 |
3 | The video/image paths here support both absolute and relative paths.
4 | Please use an address that can be accessed on all nodes (such as an address within a NAS file-sharing system).
5 | Relative paths are resolved against the directory where the dataset file is located (the dataset_path parameter in the config):
6 | - if the dataset_path parameter is a directory, paths are relative to dataset_path
7 | - if the dataset_path parameter is a file, paths are relative to the dirname of dataset_path
8 |
--------------------------------------------------------------------------------
/demos/process_video_on_ray/data/demo-dataset.jsonl:
--------------------------------------------------------------------------------
1 | {"videos": ["./videos/video1.mp4"], "text": "<__dj__video> 10s videos <|__dj__eoc|>'}"}
2 | {"videos": ["./videos/video2.mp4"], "text": "<__dj__video> 23s videos <|__dj__eoc|>'}"}
3 | {"videos": ["./videos/video3.mp4"], "text": "<__dj__video> 46s videos <|__dj__eoc|>'}"}
--------------------------------------------------------------------------------
/demos/process_video_on_ray/data/videos/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/process_video_on_ray/data/videos/video1.mp4
--------------------------------------------------------------------------------
/demos/process_video_on_ray/data/videos/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/process_video_on_ray/data/videos/video2.mp4
--------------------------------------------------------------------------------
/demos/process_video_on_ray/data/videos/video3.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/process_video_on_ray/data/videos/video3.mp4
--------------------------------------------------------------------------------
/demos/tool_quality_classifier/quality_classifier/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/tool_quality_classifier/quality_classifier/__init__.py
--------------------------------------------------------------------------------
/docs/imgs/data-juicer.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/docs/imgs/data-juicer.jpg
--------------------------------------------------------------------------------
/docs/imgs/eval-01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/docs/imgs/eval-01.png
--------------------------------------------------------------------------------
/docs/imgs/eval-02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/docs/imgs/eval-02.png
--------------------------------------------------------------------------------
/docs/sphinx_doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/sphinx_doc/README.md:
--------------------------------------------------------------------------------
1 | # Data-Juicer Documentation
2 |
3 | We build our documentation with the help of Sphinx.
4 | To update the generated
5 | docs, please run the following commands:
6 |
7 | ```bash
8 | # $~/data_juicer/docs/sphinx_doc
9 | # 1. install the sphinx requirements and init the sphinx-quickstart
10 | pip install "py-data-juicer[dev]"
11 | # or pip install -r ../../environments/dev_requires
12 |
13 | # 2. auto generate and build the doc
14 | ./build_doc.sh
15 |
16 | # 3. finalize the doc, which is stored in the `build/` directory
17 | mv build/ position_to_publish
18 | ```
19 |
20 | The automated GitHub action can be found [here](https://github.com/modelscope/data-juicer/blob/main/.github/workflows/deploy_sphinx_docs.yml).
--------------------------------------------------------------------------------
/docs/sphinx_doc/README_ZH.md:
--------------------------------------------------------------------------------
1 | # Data-Juicer 文档
2 |
3 | Data-Juicer 借助 Sphinx 构建文档。
4 | 如需更新生成的文档,请运行以下命令:
5 |
6 | ```bash
7 | # $~/data_juicer/docs/sphinx_doc
8 | # 1.安装 sphinx 的依赖并初始化 sphinx-quickstart
9 | pip install "py-data-juicer[dev]"
10 | # or pip install -r ../../environments/dev_requires
11 | # 2. 运行文档构建脚本
12 | ./build_doc.sh
13 |
14 | # 3. 构建完成的文档存储目录为 `build/`
15 | mv build/ position_to_publish
16 | ```
17 |
18 | Github上的自动化部署配置可参考 [该处](
19 | https://github.com/modelscope/data-juicer/blob/main/.github/workflows/deploy_sphinx_docs.yml).
--------------------------------------------------------------------------------
/docs/sphinx_doc/_templates/package.rst_t:
--------------------------------------------------------------------------------
1 | {%- macro automodule(modname, options) -%}
2 | .. automodule:: {{ modname }}
3 | {%- for option in options %}
4 | :{{ option }}:
5 | {%- endfor %}
6 | {%- endmacro %}
7 |
8 | {{- pkgname | heading }}
9 |
10 | {%- macro toctree(docnames) -%}
11 | .. toctree::
12 | :maxdepth: {{ maxdepth }}
13 | {% for docname in docnames %}
14 | {{ docname }}
15 | {%- endfor %}
16 | {%- endmacro %}
17 |
18 | {{ automodule(pkgname, automodule_options) }}
19 |
--------------------------------------------------------------------------------
/docs/sphinx_doc/build_doc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | make clean
3 | languages=(en zh_CN)
4 |
5 | for lang in "${languages[@]}"; do
6 | sphinx-multiversion source build/$lang -D "language=$lang"
7 | done
8 |
9 |
--------------------------------------------------------------------------------
/docs/sphinx_doc/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/sphinx_doc/redirect.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 | <meta http-equiv="refresh" content="0; url=https://[REPOSITORY_OWNER].github.io/data-juicer/en/main/">
5 | <title>Redirecting to https://[REPOSITORY_OWNER].github.io/data-juicer/en/main/</title>
6 | </head>
7 | <body>
8 | </body>
9 | </html>
--------------------------------------------------------------------------------
/docs/sphinx_doc/source/_static/sidebar-menu.css:
--------------------------------------------------------------------------------
1 | .sidebar-bottom-menu {
2 | position: sticky;
3 | bottom: 0;
4 | width: 100%;
5 | background: var(--color-sidebar-background);
6 | border-top: 1px solid var(--color-sidebar-search-border);
7 | z-index: 1000;
8 | }
9 |
10 | .sidebar-bottom-menu .current-info {
11 | padding: var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal);
12 | text-align: center;
13 | cursor: pointer;
14 | color: var(--color-foreground-primary);
15 | }
16 |
17 | .sidebar-bottom-menu .current-info span {
18 | margin: 0 0.5em;
19 | }
20 |
21 | .sidebar-bottom-menu .dropdown-panel {
22 | display: none;
23 | position: absolute;
24 | bottom: 100%;
25 | left: 0;
26 | width: 100%;
27 | background: var(--color-sidebar-background);
28 | border-top: 1px solid var(--color-sidebar-search-border);
29 | }
30 |
31 | .sidebar-bottom-menu:hover .dropdown-panel {
32 | display: block;
33 | }
34 |
35 | .sidebar-bottom-menu .section {
36 | padding: var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal);
37 | }
38 |
39 | .sidebar-bottom-menu .section dt {
40 | color: var(--color-foreground-secondary);
41 | margin-bottom: 0.5em;
42 | font-weight: bold;
43 | }
44 |
45 | .sidebar-bottom-menu .section dd {
46 | margin: 0;
47 | }
48 |
49 | .sidebar-bottom-menu .section dd a {
50 | display: block;
51 | padding: 0.3em 0;
52 | color: var(--color-sidebar-link-text--top-level);
53 | text-decoration: none;
54 | }
55 |
56 | .sidebar-bottom-menu .section dd a:hover {
57 | color: var(--color-sidebar-link-text--top-level--hover);
58 | }
59 |
60 | .sidebar-bottom-menu .section dd a.active {
61 | font-weight: bold;
62 | }
63 |
--------------------------------------------------------------------------------
/docs/sphinx_doc/source/_templates/page.html:
--------------------------------------------------------------------------------
1 | {% extends "!page.html" %}
2 | {% block body %}
3 | {% if current_version and latest_version and current_version != latest_version %}
4 |
5 |
6 | {% if current_version.is_released %}
7 | You're reading an old version of this documentation.
8 | If you want up-to-date information, please have a look at {{latest_version.name}} .
9 | {% else %}
10 | You're reading the documentation from the main branch.
11 | For the latest released version, please have a look at {{latest_version.name}} .
12 | {% endif %}
13 |
14 |
15 | {% endif %}
16 | {{ super() }}
17 | {% endblock %}
--------------------------------------------------------------------------------
/docs/sphinx_doc/source/_templates/sidebar/bottom_menu.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/sphinx_doc/source/api.rst:
--------------------------------------------------------------------------------
1 | API Reference
2 | =============
3 | .. toctree::
4 | :maxdepth: 2
5 | :glob:
6 |
7 | data_juicer.core
8 | data_juicer.ops
9 | data_juicer.ops.filter
10 | data_juicer.ops.mapper
11 | data_juicer.ops.deduplicator
12 | data_juicer.ops.selector
13 | data_juicer.ops.common
14 | data_juicer.analysis
15 | data_juicer.config
16 | data_juicer.format
17 |
--------------------------------------------------------------------------------
/docs/sphinx_doc/source/index.rst:
--------------------------------------------------------------------------------
1 | .. _Data Processing for and with Foundation Models:
2 | .. role:: raw-html-m2r(raw)
3 | :format: html
4 |
5 | `[中文主页] <../../zh_CN/main/index_ZH>`_ | `[DJ-Cookbook] `_ | `[OperatorZoo] `_ | `[API] `_ | `[Awesome LLM Data] `_
6 |
7 | Data Processing for and with Foundation Models
8 | ==============================================
9 |
10 | .. include:: README.md
11 | :start-after: # Data Processing for and with Foundation Models
12 | :parser: myst_parser.sphinx_
13 |
14 | .. toctree::
15 | :maxdepth: 2
16 | :caption: Tutorial
17 | :glob:
18 | :hidden:
19 |
20 | docs/tutorial/*
21 |
22 | .. toctree::
23 | :maxdepth: 2
24 | :caption: docs
25 | :glob:
26 | :hidden:
27 |
28 | docs/Operators
29 | docs/RecipeGallery
30 | docs/DatasetCfg
31 | docs/*
32 |
33 | .. toctree::
34 | :maxdepth: 2
35 | :caption: demos
36 | :glob:
37 | :hidden:
38 |
39 | demos/*
40 | demos/**/*
41 |
42 | .. toctree::
43 | :maxdepth: 2
44 | :caption: tools
45 | :glob:
46 | :hidden:
47 |
48 | tools/*
49 | tools/**/*
50 |
51 | .. toctree::
52 | :maxdepth: 2
53 | :caption: thirdparty
54 | :glob:
55 | :hidden:
56 |
57 | thirdparty/*
58 | thirdparty/**/*
59 |
60 | .. toctree::
61 | :maxdepth: 2
62 | :caption: API Reference
63 | :hidden:
64 |
65 | api
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/docs/sphinx_doc/source/index_ZH.rst:
--------------------------------------------------------------------------------
1 | .. _Data Processing for and with Foundation Models:
2 | .. role:: raw-html-m2r(raw)
3 | :format: html
4 |
5 |
6 | `[英文主页] <../../en/main/index>`_ | `[DJ-Cookbook] `_ | `[算子池] `_ | `[API] `_ | `[Awesome LLM Data] `_
7 |
8 | Data Processing for and with Foundation Models
9 | ==============================================
10 |
11 | .. include:: README_ZH.md
12 | :start-after: # Data Processing for and with Foundation Models
13 | :parser: myst_parser.sphinx_
14 |
15 | .. toctree::
16 | :maxdepth: 2
17 | :caption: 教程
18 | :glob:
19 | :hidden:
20 |
21 | docs/tutorial/*
22 |
23 | .. toctree::
24 | :maxdepth: 2
25 | :caption: 帮助文档
26 | :glob:
27 | :hidden:
28 |
29 | docs/Operators
30 | docs/RecipeGallery_ZH
31 | docs/DatasetCfg_ZH
32 | docs/*
33 |
34 | .. toctree::
35 | :maxdepth: 2
36 | :caption: demos
37 | :glob:
38 | :hidden:
39 |
40 | demos/*
41 | demos/**/*
42 |
43 | .. toctree::
44 | :maxdepth: 2
45 | :caption: 工具
46 | :glob:
47 | :hidden:
48 |
49 | tools/*
50 | tools/**/*
51 |
52 | .. toctree::
53 | :maxdepth: 2
54 | :caption: 第三方
55 | :glob:
56 | :hidden:
57 |
58 | thirdparty/*
59 | thirdparty/**/*
60 |
61 | .. toctree::
62 | :maxdepth: 2
63 | :hidden:
64 | :caption: API Reference
65 |
66 | api
--------------------------------------------------------------------------------
/label_studio_localhost_connection.json:
--------------------------------------------------------------------------------
1 | {
2 | "server_url": "http://localhost:7070",
3 | "api_token": "05409236-67a5-4169-af96-a52a818d0e81",
4 | "username": "admin@example.com",
5 | "password": "abcd1234"
6 | }
7 |
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | # Scripts for Running on Multi Nodes
2 |
3 |
4 | #### Running Using DLC (Deep Learning Containers)
5 |
6 | Internally we use [DLC](https://www.alibabacloud.com/help/zh/pai/user-guide/container-training/) from [PAI](https://www.alibabacloud.com/zh/product/machine-learning) to process data on multiple nodes.
7 |
8 | The scripts to run are in the ./dlc folder.
9 |
10 | #### Running Using Slurm
11 |
12 | We provide scripts to support running on Slurm; see ./run_slurm.sh.
13 |
14 | You can also manually partition the data as needed and then run it on multiple machines with Slurm yourself.
15 |
--------------------------------------------------------------------------------
/scripts/dlc/run_on_dlc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # parameters
4 | datajuicer_path= # path to data-juicer
5 | config_path= # path to config file
6 |
7 | # hostname
8 | hostname=$(hostname)
9 |
10 | # into datajuicer_path
11 | cd "$datajuicer_path" || { echo "Could not change directory to $datajuicer_path"; exit 1; }
12 |
13 | # copy and generate new config file for current host
14 |
15 | config_basename=$(basename "$config_path")
16 | config_dirname=$(dirname "$config_path")
17 | config_extension="${config_basename##*.}"
18 | config_basename="${config_basename%.*}"
19 |
20 | new_config_file="${config_dirname}/${config_basename}_$hostname.$config_extension"
21 | cp "$config_path" "$new_config_file" || { echo "Could not copy config file"; exit 1; }
22 |
23 | echo "$new_config_file"
24 |
25 | if [[ "$OSTYPE" == "darwin"* ]]; then
26 | SED_I_SUFFIX=".bak"
27 | else
28 | SED_I_SUFFIX=""
29 | fi
30 |
31 | if grep -q "dataset_path: .*\.json" "$new_config_file"; then
32 | # .json data_path
33 | sed -i$SED_I_SUFFIX "s|\(dataset_path: \)\(.*\)\(/[^/]*\)\(.json\)|\1\2\3_$hostname\4|" "$new_config_file"
34 | else
35 | # dir dataset_path
36 | sed -i$SED_I_SUFFIX "s|\(dataset_path: '\)\(.*\)'\(.*\)|\1\2_$hostname'\3|" "$new_config_file"
37 | fi
38 |
39 | if grep -q "export_path: .*\.json" "$new_config_file"; then
40 | # .json data_path
41 | sed -i$SED_I_SUFFIX "s|\(export_path: \)\(.*\)\(/[^/]*\)\(.json\)|\1\2\3_$hostname\4|" "$new_config_file"
42 | else
43 | # dir export_path
44 | sed -i$SED_I_SUFFIX "s|\(export_path: '\)\(.*\)'\(.*\)|\1\2_$hostname'\3|" "$new_config_file"
45 | fi
46 |
47 | # run to process data
48 | python tools/process_data.py --config "$new_config_file"
49 |
--------------------------------------------------------------------------------
/scripts/run_slurm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=parallel_data_processing
3 | #SBATCH --ntasks=
4 | #SBATCH --nodes=
5 | #SBATCH --time=
6 | #SBATCH --partition=your_partition_name
7 | #SBATCH --output=parallel_data_processing_%j.out
8 | #SBATCH --error=parallel_data_processing_%j.err
9 | #SBATCH --exclusive
10 |
11 | # set data-juicer and config file path
12 | datajuicer_path= # please fill the actual path of datajuicer
13 | config_path= # please fill the actual path of config file
14 |
15 |
16 | cd $datajuicer_path
17 |
18 | readarray -t nodes <<< "$(sinfo --noheader --states=idle,mixed --format=%n)"
19 |
20 | PARTITION_SCRIPT=./scripts/dlc/partition_data_dlc.py
21 |
22 | # set dataset path
23 | JSON_FILE_PATH= # please fill the actual path of dataset file
24 |
25 | # split_dataset
26 | python $PARTITION_SCRIPT --input_file_path $JSON_FILE_PATH --output_file_path $JSON_FILE_PATH --hostnames "${nodes[@]}"
27 |
28 | # run on nodes
29 |
30 | for node in "${nodes[@]}"; do
31 | echo $node
32 | nohup srun --nodes=1 --ntasks=1 -w $node scripts/dlc/run_on_dlc.sh > output_$node.log 2>&1 &
33 | done
34 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/__init__.py
--------------------------------------------------------------------------------
/tests/analysis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/analysis/__init__.py
--------------------------------------------------------------------------------
/tests/analysis/test_collector.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | import torch.distributions
5 |
6 | from data_juicer.analysis.collector import TextTokenDistCollector
7 |
8 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
9 |
10 | class TextTokenDistCollectorTest(DataJuicerTestCaseBase):
11 |
12 | test_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
13 | '..',
14 | '..',
15 | 'demos',
16 | 'data',
17 | 'demo-dataset.jsonl')
18 |
19 | tokenizer_model = 'EleutherAI/pythia-6.9b-deduped'
20 |
21 | @classmethod
22 | def tearDownClass(cls) -> None:
23 | super().tearDownClass(cls.tokenizer_model)
24 |
25 | def test_basic_func(self):
26 | collector = TextTokenDistCollector(self.tokenizer_model)
27 | dist = collector.collect(self.test_data_path, 'text')
28 | self.assertIsInstance(dist, torch.distributions.Categorical)
29 |
30 |
31 | if __name__ == '__main__':
32 | unittest.main()
33 |
--------------------------------------------------------------------------------
/tests/analysis/test_draw.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 |
5 | from data_juicer.analysis.draw import draw_heatmap
6 |
7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
8 |
9 | class DrawTest(DataJuicerTestCaseBase):
10 |
11 | def test_basic_func(self):
12 | test_data = [
13 | {'a': 1, 'b': 2, 'c': 3},
14 | {'a': 4, 'b': 5, 'c': 6},
15 | {'a': 7, 'b': 8, 'c': 9},
16 | {'a': 10, 'b': 11, 'c': 12},
17 | {'a': 13, 'b': 14, 'c': 15},
18 | ]
19 | data = pd.DataFrame.from_records(test_data)
20 | ret = draw_heatmap(data, data.columns, triangle=True, show=True)
21 | self.assertIsInstance(ret, plt.Figure)
22 | ret = draw_heatmap(data, data.columns, show=True)
23 | self.assertIsInstance(ret, plt.Figure)
24 |
25 |
26 | if __name__ == '__main__':
27 | unittest.main()
28 |
--------------------------------------------------------------------------------
/tests/benchmark_performance/configs/audio.yaml:
--------------------------------------------------------------------------------
1 | # The config file for performance benchmark to measure the processing speed for
2 | # the current Data-Juicer system. OPs are selected according to their tags and
3 | # types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md)
4 |
5 | project_name: 'performance-benchmark-audio'
6 | dataset_path: 'perf_bench_data/audio/audio-10k.jsonl'
7 | export_path: 'outputs/performance_benchmark_audio/res.jsonl'
8 | np: 16
9 | use_cache: false
10 |
11 | process:
12 | - audio_duration_filter:
13 | - audio_nmf_snr_filter:
14 | - audio_size_filter:
15 |
--------------------------------------------------------------------------------
/tests/benchmark_performance/configs/image.yaml:
--------------------------------------------------------------------------------
1 | # The config file for performance benchmark to measure the processing speed for
2 | # the current Data-Juicer system. OPs are selected according to their tags and
3 | # types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md)
4 |
5 | project_name: 'performance-benchmark-image'
6 | dataset_path: 'perf_bench_data/image/10k.jsonl'
7 | export_path: 'outputs/performance_benchmark_image/res.jsonl'
8 | np: 16
9 | use_cache: false
10 |
11 | process:
12 | - image_aesthetics_filter:
13 | hf_scorer_model: 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE'
14 | min_score: 0.0
15 | mem_required: '1500MB'
16 | - image_captioning_mapper:
17 | hf_img2seq: 'Salesforce/blip2-opt-2.7b'
18 | caption_num: 1
19 | keep_original_sample: false
20 | mem_required: '16GB'
21 | - image_shape_filter:
22 | - image_blur_mapper:
23 | - image_deduplicator:
24 |
--------------------------------------------------------------------------------
/tests/benchmark_performance/configs/text.yaml:
--------------------------------------------------------------------------------
1 | # The config file for the performance benchmark, measuring the processing speed of
2 | # the current Data-Juicer system. OPs are selected according to their tags and
3 | # types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md)
4 |
5 | project_name: 'performance-benchmark-text'
6 | dataset_path: 'perf_bench_data/text/wiki-10k.jsonl'
7 | export_path: 'outputs/performance_benchmark_text/res.jsonl'
8 | np: 16
9 | use_cache: false
10 |
11 | process:
12 | - whitespace_normalization_mapper:
13 | - token_num_filter:
14 | hf_tokenizer: 'EleutherAI/pythia-6.9b-deduped'
15 | min_num: 0
16 | - document_deduplicator:
17 | lowercase: false
18 | ignore_non_character: false
19 | - topk_specified_field_selector:
20 | field_key: '__dj__stats__.num_token'
21 | topk: 1000
22 |
--------------------------------------------------------------------------------
/tests/benchmark_performance/configs/video.yaml:
--------------------------------------------------------------------------------
1 | # The config file for the performance benchmark, measuring the processing speed of
2 | # the current Data-Juicer system. OPs are selected according to their tags and
3 | # types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md)
4 |
5 | project_name: 'performance-benchmark-video'
6 | dataset_path: 'perf_bench_data/video/msr_vtt_train.jsonl'
7 | export_path: 'outputs/performance_benchmark_video/res.jsonl'
8 | np: 16
9 | use_cache: false
10 |
11 | process:
12 | - video_nsfw_filter:
13 | hf_nsfw_model: 'Falconsai/nsfw_image_detection'
14 | max_score: 1.0
15 | mem_required: '1GB'
16 | - video_tagging_from_frames_mapper:
17 | mem_required: '9GB'
18 | - video_duration_filter:
19 | - video_split_by_key_frame_mapper:
20 | keep_original_sample: false
21 | - video_deduplicator:
22 |
--------------------------------------------------------------------------------
/tests/benchmark_performance/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # setup wandb configs
4 | export WANDB_BASE_URL=$1
5 | export WANDB_API_KEY=$2
6 |
7 | BENCH_PATH=$(cd "$(dirname "$0")"; pwd)
8 | RELATIVE_DJ_PATH=../..
9 | MODALITIES=("text" "image" "video" "audio")
10 |
11 | cd $BENCH_PATH
12 |
13 | # 1. prepare dataset
14 | echo "Preparing benchmark dataset..."
15 | wget -q http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxf perf_bench_data.tar.gz
16 |
17 | # 2. run the benchmark
18 | for modality in ${MODALITIES[@]}
19 | do
20 | echo "Running benchmark for $modality modality..."
21 | python $RELATIVE_DJ_PATH/tools/process_data.py --config configs/$modality.yaml
22 | done
23 |
24 | # 3. collect & upload benchmark results
25 | echo "Collecting and reporting benchmark results..."
26 | python report.py
27 |
28 | # 4. clear resources
29 | echo "Clearing resources..."
30 | rm -rf perf_bench_data.tar.gz
31 | rm -rf perf_bench_data/
32 |
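The script reads the W&B base URL and API key from its first two positional arguments. A minimal invocation sketch, where both values below are placeholders for your own endpoint and credentials:

```shell
# hypothetical invocation of the performance benchmark
bash run.sh https://api.wandb.ai YOUR_WANDB_API_KEY
```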
--------------------------------------------------------------------------------
/tests/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/config/__init__.py
--------------------------------------------------------------------------------
/tests/config/demo_4_test.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for Arxiv dataset
2 |
3 | # global parameters
4 | project_name: 'test_demo'
5 | dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file
6 | np: 4                                                          # number of subprocesses to process your dataset
7 |
8 | export_path: './outputs/demo/demo-processed.parquet'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 | - whitespace_normalization_mapper:
14 | - language_id_score_filter:
15 | lang: 'zh'
16 | - document_deduplicator: # deduplicate text samples using md5 hashing exact matching method
17 | lowercase: false # whether to convert text to lower case
18 | ignore_non_character: false
19 | - remove_table_text_mapper:
20 |
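Although this file is a test fixture, it follows the standard process-config schema, so it can also be run end-to-end. A sketch from the repository root (the output location comes from the config's own `export_path`); the exact invocation is an assumption based on the benchmark script above:

```shell
# hypothetical end-to-end run of the test config; results land in ./outputs/demo/
python tools/process_data.py --config tests/config/demo_4_test.yaml
```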
--------------------------------------------------------------------------------
/tests/config/demo_4_test_bad_val.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for Arxiv dataset
2 |
3 | # global parameters
4 | project_name: 'test_demo'
5 | dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file
6 | np: 4                                                          # number of subprocesses to process your dataset
7 |
8 | export_path: './outputs/demo/demo-processed.parquet'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 | - whitespace_normalization_mapper:
14 | - language_id_score_filter:
15 | lang: 'zh'
16 | - document_deduplicator: # deduplicate text samples using md5 hashing exact matching method
17 | lowercase: false # whether to convert text to lower case
18 | ignore_non_character: false
19 | - remove_table_text_mapper:
20 | max_col: 30 # !! a bad value !!
--------------------------------------------------------------------------------
/tests/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/core/__init__.py
--------------------------------------------------------------------------------
/tests/core/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/core/data/__init__.py
--------------------------------------------------------------------------------
/tests/core/data/test_data/sample.jsonl:
--------------------------------------------------------------------------------
1 | {"text": "Today is Sunday and it's a happy day!"}
2 | {"text": "Today is Monday and it's a happy day!"}
3 | {"text": "Today is Tuesday and it's a happy day!"}
4 | {"text": "Today is Wednesday and it's a happy day!"}
5 | {"text": "Today is Thursday and it's a happy day!"}
6 | {"text": "Today is Friday and it's a happy day!"}
7 | {"text": "Today is Saturday and it's a happy day!"}
8 |
--------------------------------------------------------------------------------
/tests/core/data/test_data/sample.txt:
--------------------------------------------------------------------------------
1 | Today is Sunday and it's a happy day!
2 |
--------------------------------------------------------------------------------
/tests/core/data/test_data/test_config.yaml:
--------------------------------------------------------------------------------
1 | project_name: 'dataset-local-json'
2 | dataset:
3 | configs:
4 | - type: 'local'
5 | path: 'sample.jsonl'
--------------------------------------------------------------------------------
/tests/core/data/test_data/test_config_list.yaml:
--------------------------------------------------------------------------------
1 | project_name: 'dataset-local-list'
2 | dataset:
3 | configs:
4 | - type: 'local'
5 | path: 'sample.jsonl'
6 | - type: 'local'
7 | path: 'sample.txt'
--------------------------------------------------------------------------------
/tests/core/data/test_data/test_config_ray.yaml:
--------------------------------------------------------------------------------
1 |
2 | # global parameters
3 | project_name: 'ray-demo-new-config'
4 | dataset:
5 | configs:
6 | - type: local
7 | path: ./test_data/sample.jsonl # path to your dataset directory or file
8 |
9 | export_path: './outputs/demo/demo-processed'
10 |
11 | executor_type: 'ray'
12 | ray_address: 'auto'
13 |
14 |
--------------------------------------------------------------------------------
/tests/download/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/download/__init__.py
--------------------------------------------------------------------------------
/tests/format/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/format/__init__.py
--------------------------------------------------------------------------------
/tests/format/data/structured/demo-dataset.csv:
--------------------------------------------------------------------------------
1 | text,meta
2 | Today is Sunday and it's a happy day!,"{'src': 'Arxiv', 'date': datetime.datetime(2023, 4, 27, 0, 0), 'version': '1.0', 'author': None}"
3 | Do you need a cup of coffee?,"{'src': 'code', 'date': None, 'version': None, 'author': 'xxx'}"
4 | 你好,请问你是谁,"{'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}"
5 | "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.","{'src': 'Oscar', 'date': None, 'version': '2.0', 'author': None}"
6 | 欢迎来到阿里巴巴!,"{'src': 'customized', 'date': None, 'version': '0.1', 'author': 'xxx'}"
7 | This paper proposed a novel method on LLM pretraining.,"{'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}"
8 |
--------------------------------------------------------------------------------
/tests/format/data/structured/demo-dataset.jsonl:
--------------------------------------------------------------------------------
1 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
2 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
3 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
4 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
5 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
6 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
7 |
--------------------------------------------------------------------------------
/tests/format/data/structured/demo-dataset.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/format/data/structured/demo-dataset.parquet
--------------------------------------------------------------------------------
/tests/format/data/structured/demo-dataset.tsv:
--------------------------------------------------------------------------------
1 | text meta
2 | Today is Sunday and it's a happy day! {'src': 'Arxiv', 'date': datetime.datetime(2023, 4, 27, 0, 0), 'version': '1.0', 'author': None}
3 | Do you need a cup of coffee? {'src': 'code', 'date': None, 'version': None, 'author': 'xxx'}
4 | 你好,请问你是谁 {'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}
5 | Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément. {'src': 'Oscar', 'date': None, 'version': '2.0', 'author': None}
6 | 欢迎来到阿里巴巴! {'src': 'customized', 'date': None, 'version': '0.1', 'author': 'xxx'}
7 | This paper proposed a novel method on LLM pretraining. {'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}
8 |
--------------------------------------------------------------------------------
/tests/format/data/text/sample1.txt:
--------------------------------------------------------------------------------
1 | Today is Sunday and it's a happy day!
2 |
--------------------------------------------------------------------------------
/tests/format/data/text/sample2.txt:
--------------------------------------------------------------------------------
1 | Do you need a cup of coffee?
2 |
--------------------------------------------------------------------------------
/tests/format/data/text/sample3.txt:
--------------------------------------------------------------------------------
1 | 你好,请问你是谁
2 |
--------------------------------------------------------------------------------
/tests/format/data/text/sample4.txt:
--------------------------------------------------------------------------------
1 | Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.
2 |
--------------------------------------------------------------------------------
/tests/format/data/text/sample5.txt:
--------------------------------------------------------------------------------
1 | 欢迎来到阿里巴巴!
2 |
--------------------------------------------------------------------------------
/tests/format/data/text/sample6.txt:
--------------------------------------------------------------------------------
1 | This paper proposed a novel method on LLM pretraining.
2 |
--------------------------------------------------------------------------------
/tests/format/test_csv_formatter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from data_juicer.format.csv_formatter import CsvFormatter
5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
6 |
7 |
8 | class CsvFormatterTest(DataJuicerTestCaseBase):
9 |
10 | def setUp(self):
11 | self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
12 | 'data', 'structured')
13 | self._file = os.path.join(self._path, 'demo-dataset.csv')
14 | print(self._file)
15 |
16 | def test_csv_file(self):
17 | formatter = CsvFormatter(self._file)
18 | ds = formatter.load_dataset()
19 | self.assertEqual(len(ds), 6)
20 | self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
21 |
22 | def test_csv_path(self):
23 | formatter = CsvFormatter(self._path)
24 | ds = formatter.load_dataset()
25 | self.assertEqual(len(ds), 6)
26 | self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
27 |
28 |
29 | if __name__ == '__main__':
30 | unittest.main()
31 |
--------------------------------------------------------------------------------
/tests/format/test_empty_formatter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from data_juicer.format.empty_formatter import EmptyFormatter
5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
6 |
7 |
8 | class EmptyFormatterTest(DataJuicerTestCaseBase):
9 |
10 | text_key = 'text'
11 |
12 | def test_empty_dataset(self):
13 | ds_len = 10
14 | formatter = EmptyFormatter(length=ds_len, feature_keys=[self.text_key])
15 | ds = formatter.load_dataset()
16 |
17 | self.assertEqual(len(ds), ds_len)
18 | self.assertEqual(list(ds.features.keys()), [self.text_key])
19 |
20 | for item in ds:
21 | self.assertDictEqual(item, {self.text_key: None})
22 |
23 | # test map
24 | update_column = {self.text_key: 1}
25 |
26 | def map_fn(sample):
27 | sample.update(update_column)
28 | return sample
29 |
30 | ds = ds.map(map_fn)
31 | self.assertEqual(len(ds), ds_len)
32 | for item in ds:
33 | self.assertDictEqual(item, update_column)
34 |
35 | # test filter
36 | def filter_fn(sample):
37 | return sample[self.text_key] > 2
38 |
39 | ds = ds.filter(filter_fn)
40 | self.assertEqual(len(ds), 0)
41 |
42 |
43 | if __name__ == '__main__':
44 | unittest.main()
45 |
--------------------------------------------------------------------------------
/tests/format/test_parquet_formatter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from data_juicer.format.parquet_formatter import ParquetFormatter
5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
6 |
7 |
8 | class ParquetFormatterTest(DataJuicerTestCaseBase):
9 |
10 | def setUp(self):
11 | self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
12 | 'data', 'structured')
13 | self._file = os.path.join(self._path, 'demo-dataset.parquet')
14 | print(self._file)
15 |
16 | def test_parquet_file(self):
17 | formatter = ParquetFormatter(self._file)
18 | ds = formatter.load_dataset()
19 | self.assertEqual(len(ds), 6)
20 | self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
21 |
22 | def test_parquet_path(self):
23 | formatter = ParquetFormatter(self._path)
24 | ds = formatter.load_dataset()
25 | self.assertEqual(len(ds), 6)
26 | self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
27 |
28 |
29 | if __name__ == '__main__':
30 | unittest.main()
31 |
--------------------------------------------------------------------------------
/tests/format/test_tsv_formatter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from data_juicer.format.tsv_formatter import TsvFormatter
5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
6 |
7 |
8 | class TsvFormatterTest(DataJuicerTestCaseBase):
9 |
10 | def setUp(self):
11 | self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
12 | 'data', 'structured')
13 | self._file = os.path.join(self._path, 'demo-dataset.tsv')
14 | print(self._file)
15 |
16 | def test_tsv_file(self):
17 | formatter = TsvFormatter(self._file)
18 | ds = formatter.load_dataset()
19 | self.assertEqual(len(ds), 6)
20 | self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
21 |
22 | def test_tsv_path(self):
23 | formatter = TsvFormatter(self._path)
24 | ds = formatter.load_dataset()
25 | self.assertEqual(len(ds), 6)
26 | self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
27 |
28 |
29 | if __name__ == '__main__':
30 | unittest.main()
31 |
--------------------------------------------------------------------------------
/tests/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/__init__.py
--------------------------------------------------------------------------------
/tests/ops/aggregator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/aggregator/__init__.py
--------------------------------------------------------------------------------
/tests/ops/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/common/__init__.py
--------------------------------------------------------------------------------
/tests/ops/data/audio1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/audio1.wav
--------------------------------------------------------------------------------
/tests/ops/data/audio2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/audio2.wav
--------------------------------------------------------------------------------
/tests/ops/data/audio3.ogg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/audio3.ogg
--------------------------------------------------------------------------------
/tests/ops/data/blip.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/blip.jpg
--------------------------------------------------------------------------------
/tests/ops/data/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/cat.jpg
--------------------------------------------------------------------------------
/tests/ops/data/img1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img1.png
--------------------------------------------------------------------------------
/tests/ops/data/img2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img2.jpg
--------------------------------------------------------------------------------
/tests/ops/data/img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img3.jpg
--------------------------------------------------------------------------------
/tests/ops/data/img4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img4.png
--------------------------------------------------------------------------------
/tests/ops/data/img5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img5.jpg
--------------------------------------------------------------------------------
/tests/ops/data/img6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img6.jpg
--------------------------------------------------------------------------------
/tests/ops/data/img7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img7.jpg
--------------------------------------------------------------------------------
/tests/ops/data/img8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img8.jpg
--------------------------------------------------------------------------------
/tests/ops/data/img_pair_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img_pair_1.jpg
--------------------------------------------------------------------------------
/tests/ops/data/img_pair_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img_pair_2.jpg
--------------------------------------------------------------------------------
/tests/ops/data/lena-face.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/lena-face.jpg
--------------------------------------------------------------------------------
/tests/ops/data/lena.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/lena.jpg
--------------------------------------------------------------------------------
/tests/ops/data/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video1.mp4
--------------------------------------------------------------------------------
/tests/ops/data/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video2.mp4
--------------------------------------------------------------------------------
/tests/ops/data/video3-no-audio.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video3-no-audio.mp4
--------------------------------------------------------------------------------
/tests/ops/data/video3.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video3.mp4
--------------------------------------------------------------------------------
/tests/ops/data/video4.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video4.mp4
--------------------------------------------------------------------------------
/tests/ops/data/video5.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video5.mp4
--------------------------------------------------------------------------------
/tests/ops/deduplicator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/deduplicator/__init__.py
--------------------------------------------------------------------------------
/tests/ops/filter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/filter/__init__.py
--------------------------------------------------------------------------------
/tests/ops/grouper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/grouper/__init__.py
--------------------------------------------------------------------------------
/tests/ops/grouper/test_key_value_grouper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.core.data import NestedDataset as Dataset
4 | from data_juicer.ops.grouper.key_value_grouper import KeyValueGrouper
5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
6 |
7 |
8 | class KeyValueGrouperTest(DataJuicerTestCaseBase):
9 |
10 | def _run_helper(self, op, samples, target):
11 | dataset = Dataset.from_list(samples)
12 | new_dataset = op.run(dataset)
13 |
14 | for batched_sample in new_dataset:
15 | lang = batched_sample['meta'][0]['language']
16 | self.assertEqual(batched_sample['text'], target[lang])
17 |
18 | def test_key_value_grouper(self):
19 |
20 | source = [
21 | {
22 | 'text': "Today is Sunday and it's a happy day!",
23 | 'meta': {
24 | 'language': 'en'
25 | }
26 | },
27 | {
28 | 'text': "Welcome to Alibaba.",
29 | 'meta': {
30 | 'language': 'en'
31 | }
32 | },
33 | {
34 | 'text': '欢迎来到阿里巴巴!',
35 | 'meta': {
36 | 'language': 'zh'
37 | }
38 | },
39 | ]
40 | target = {
41 | 'en':[
42 | "Today is Sunday and it's a happy day!",
43 | "Welcome to Alibaba."
44 | ],
45 | 'zh':[
46 | '欢迎来到阿里巴巴!'
47 | ]
48 | }
49 |
50 | op = KeyValueGrouper(['meta.language'])
51 | self._run_helper(op, source, target)
52 |
53 | if __name__ == '__main__':
54 | unittest.main()
--------------------------------------------------------------------------------
/tests/ops/grouper/test_naive_grouper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.core.data import NestedDataset as Dataset
4 | from data_juicer.ops.grouper.naive_grouper import NaiveGrouper
5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
6 |
7 |
8 | class NaiveGrouperTest(DataJuicerTestCaseBase):
9 |
10 | def _run_helper(self, op, samples, target):
11 | dataset = Dataset.from_list(samples)
12 | new_dataset = op.run(dataset)
13 |
14 | for d, t in zip(new_dataset, target):
15 | self.assertEqual(d['text'], t['text'])
16 |
17 | def test_naive_group(self):
18 |
19 | source = [
20 | {
21 | 'text': "Today is Sunday and it's a happy day!"
22 | },
23 | {
24 | 'text':
25 | "Sur la plateforme MT4, plusieurs manières d'accéder à \n"
26 | 'ces fonctionnalités sont conçues simultanément.'
27 | },
28 | {
29 | 'text': '欢迎来到阿里巴巴!'
30 | },
31 | ]
32 | target = [
33 | {
34 | 'text':[
35 | "Today is Sunday and it's a happy day!",
36 | "Sur la plateforme MT4, plusieurs manières d'accéder à \n"
37 | 'ces fonctionnalités sont conçues simultanément.',
38 | '欢迎来到阿里巴巴!'
39 | ]
40 | }
41 | ]
42 |
43 | op = NaiveGrouper()
44 | self._run_helper(op, source, target)
45 |
46 | if __name__ == '__main__':
47 | unittest.main()
--------------------------------------------------------------------------------
/tests/ops/mapper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/mapper/__init__.py
--------------------------------------------------------------------------------
/tests/ops/mapper/annotation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/mapper/annotation/__init__.py
--------------------------------------------------------------------------------
/tests/ops/mapper/test_clean_copyright_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.core.data import NestedDataset as Dataset
4 | from data_juicer.ops.mapper.clean_copyright_mapper import CleanCopyrightMapper
5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
6 |
7 |
8 | class CleanCopyrightMapperTest(DataJuicerTestCaseBase):
9 |
10 | def setUp(self):
11 | self.op = CleanCopyrightMapper()
12 |
13 | def _run_clean_copyright(self, samples):
14 | dataset = Dataset.from_list(samples)
15 | dataset = dataset.map(self.op.process, batch_size=2)
16 |
17 | for data in dataset:
18 | self.assertEqual(data['text'], data['target'])
19 |
20 | def test_clean_copyright(self):
21 |
22 | samples = [{
23 | 'text': '这是一段 /* 多行注释\n注释内容copyright\n*/ 的文本。另外还有一些 // 单行注释。',
24 | 'target': '这是一段 的文本。另外还有一些 // 单行注释。'
25 | }, {
26 | 'text': '如果多行/*注释中没有\n关键词,那么\n这部分注释也不会\n被清除*/\n会保留下来',
27 | 'target': '如果多行/*注释中没有\n关键词,那么\n这部分注释也不会\n被清除*/\n会保留下来'
28 | }, {
29 | 'text': '//if start with\n//that will be cleaned \n evenly',
30 | 'target': ' evenly'
31 | }, {
32 | 'text': 'http://www.nasosnsncc.com',
33 | 'target': 'http://www.nasosnsncc.com'
34 | }, {
35 | 'text': '#if start with\nthat will be cleaned \n#evenly',
36 | 'target': 'that will be cleaned \n#evenly'
37 | }, {
38 | 'text': '--if start with\n--that will be cleaned \n#evenly',
39 | 'target': ''
40 | }]
41 | self._run_clean_copyright(samples)
42 |
43 |
44 | if __name__ == '__main__':
45 | unittest.main()
46 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_fix_unicode_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.core.data import NestedDataset as Dataset
4 | from data_juicer.ops.mapper.fix_unicode_mapper import FixUnicodeMapper
5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
6 |
7 |
8 | class FixUnicodeMapperTest(DataJuicerTestCaseBase):
9 |
10 | def setUp(self):
11 | self.op = FixUnicodeMapper()
12 |
13 | def _run_fix_unicode(self, samples):
14 | dataset = Dataset.from_list(samples)
15 | dataset = dataset.map(self.op.process, batch_size=2)
16 |
17 | for data in dataset:
18 | self.assertEqual(data['text'], data['target'])
19 |
20 | def test_bad_unicode_text(self):
21 |
22 | samples = [
23 | {
24 | 'text': '✔ No problems',
25 | 'target': '✔ No problems'
26 | },
27 | {
28 | 'text':
29 | 'The Mona Lisa doesn’t have eyebrows.',
30 | 'target': 'The Mona Lisa doesn\'t have eyebrows.'
31 | },
32 | ]
33 |
34 | self._run_fix_unicode(samples)
35 |
36 | def test_good_unicode_text(self):
37 | samples = [
38 | {
39 | 'text': 'No problems',
40 | 'target': 'No problems'
41 | },
42 | {
43 | 'text': '阿里巴巴',
44 | 'target': '阿里巴巴'
45 | },
46 | ]
47 | self._run_fix_unicode(samples)
48 |
49 |
50 | if __name__ == '__main__':
51 | unittest.main()
52 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_generate_qa_from_examples_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from loguru import logger
4 |
5 | from data_juicer.ops.mapper.generate_qa_from_examples_mapper import \
6 | GenerateQAFromExamplesMapper
7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
8 |
9 | @unittest.skip('unknown vllm connection error')
10 | class GenerateQAFromExamplesMapperTest(DataJuicerTestCaseBase):
11 | text_key = 'text'
12 |
13 | def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1):
14 | op = GenerateQAFromExamplesMapper(
15 | seed_file='demos/data/demo-dataset-chatml.jsonl',
16 | example_num=3,
17 | enable_vllm=enable_vllm,
18 | sampling_params=sampling_params,
19 | )
20 |
21 | from data_juicer.format.empty_formatter import EmptyFormatter
22 | dataset = EmptyFormatter(3, [self.text_key]).load_dataset()
23 |
24 | results = dataset.map(op.process, num_proc=num_proc, with_rank=True)
25 |
26 | for row in results:
27 | logger.info(row)
28 | self.assertIn(op.query_key, row)
29 | self.assertIn(op.response_key, row)
30 |
31 | def test(self):
32 | sampling_params = {'max_new_tokens': 200}
33 | self._run_op(sampling_params=sampling_params)
34 |
35 | def test_multi_process(self):
36 | sampling_params = {'max_new_tokens': 200}
37 | self._run_op(sampling_params=sampling_params, num_proc=2)
38 |
39 | def test_vllm(self):
40 | sampling_params = {'max_tokens': 200}
41 | self._run_op(enable_vllm=True, sampling_params=sampling_params)
42 |
43 |
44 | if __name__ == '__main__':
45 | unittest.main()
46 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_mllm_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from data_juicer.ops.mapper.mllm_mapper import MllmMapper
3 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
4 | import os
5 |
6 | class MllmMapperTest(DataJuicerTestCaseBase):
7 |
8 | hf_model = 'llava-hf/llava-v1.6-vicuna-7b-hf'
9 |
10 | text_key = 'text'
11 | image_key = "images"
12 |
13 | @classmethod
14 | def tearDownClass(cls) -> None:
15 | super().tearDownClass(cls.hf_model)
16 |
17 | def _run_mllm(self):
18 | op = MllmMapper(
19 | hf_model=self.hf_model,
20 | temperature=0.9,
21 | top_p=0.95,
22 | max_new_tokens=512
23 | )
24 |
25 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
26 | 'data')
27 | img2_path = os.path.join(data_path, 'img2.jpg')
28 | img3_path = os.path.join(data_path, 'img3.jpg')
29 |
30 | samples = [
31 | {self.text_key: 'Describe this image.', self.image_key: [img2_path, img3_path]},
32 | ]
33 |
34 | for sample in samples:
35 | result = op.process(sample)
36 | self.assertIsInstance(sample[self.text_key], list)
37 | self.assertEqual(len(sample[self.text_key]), 2)
38 | print(f'Output results: {result}')
39 |
40 | def test_mllm(self):
41 | self._run_mllm()
42 |
43 |
44 | if __name__ == '__main__':
45 | unittest.main()
46 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_optimize_qa_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from loguru import logger
4 |
5 | from data_juicer.core.data import NestedDataset as Dataset
6 | from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
8 |
9 | @unittest.skip('unknown vllm connection error')
10 | class OptimizeQAMapperTest(DataJuicerTestCaseBase):
11 |
12 | def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1):
13 |
14 | op = OptimizeQAMapper(enable_vllm=enable_vllm,
15 | sampling_params=sampling_params)
16 |
17 | samples = [{
18 | 'query':
19 | '鱼香肉丝怎么做?',
20 | 'response':
21 | '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。'
22 | }, {
23 | 'query': '什么是蚂蚁上树?',
24 | 'response': '蚂蚁上树是一道中国菜。'
25 | }]
26 | dataset = Dataset.from_list(samples)
27 | results = dataset.map(op.process, num_proc=num_proc, with_rank=True)
28 |
29 | for row in results:
30 | logger.info(f'Output results: {row}')
31 | self.assertNotEqual(row['query'], '')
32 | self.assertNotEqual(row['response'], '')
33 |
34 | def test(self):
35 | sampling_params = {'max_new_tokens': 200}
36 | self._run_op(sampling_params=sampling_params)
37 |
38 | def test_multi_process(self):
39 | sampling_params = {'max_new_tokens': 200}
40 | self._run_op(sampling_params=sampling_params, num_proc=2)
41 |
42 | def test_vllm(self):
43 | sampling_params = {'max_tokens': 200}
44 | self._run_op(enable_vllm=True, sampling_params=sampling_params)
45 |
46 |
47 | if __name__ == '__main__':
48 | unittest.main()
49 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_optimize_query_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from loguru import logger
4 |
5 | from data_juicer.core.data import NestedDataset as Dataset
6 | from data_juicer.ops.mapper.optimize_query_mapper import OptimizeQueryMapper
7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
8 |
9 | @unittest.skip('unknown vllm connection error')
10 | class OptimizeQueryMapperTest(DataJuicerTestCaseBase):
11 |
12 | def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1):
13 |
14 | op = OptimizeQueryMapper(
15 | hf_model='alibaba-pai/Qwen2-7B-Instruct-Refine',
16 | input_template='{}',
17 | qa_pair_template='{}',
18 | enable_vllm=enable_vllm,
19 | sampling_params=sampling_params)
20 |
21 | samples = [{
22 | 'query':
23 | '鱼香肉丝怎么做?',
24 | 'response':
25 | '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。'
26 | }, {
27 | 'query': '什么是蚂蚁上树?',
28 | 'response': '蚂蚁上树是一道中国菜。'
29 | }]
30 | dataset = Dataset.from_list(samples)
31 | results = dataset.map(op.process, num_proc=num_proc, with_rank=True)
32 |
33 | for row in results:
34 | logger.info(f'Output results: {row}')
35 | self.assertNotEqual(row['query'], '')
36 |
37 | def test(self):
38 | sampling_params = {'max_new_tokens': 200}
39 | self._run_op(sampling_params=sampling_params)
40 |
41 | def test_multi_process(self):
42 | sampling_params = {'max_new_tokens': 200}
43 | self._run_op(sampling_params=sampling_params, num_proc=2)
44 |
45 | def test_vllm(self):
46 | sampling_params = {'max_tokens': 200}
47 | self._run_op(enable_vllm=True, sampling_params=sampling_params)
48 |
49 |
50 | if __name__ == '__main__':
51 | unittest.main()
52 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_optimize_response_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from loguru import logger
4 |
5 | from data_juicer.core.data import NestedDataset as Dataset
6 | from data_juicer.ops.mapper.optimize_response_mapper import \
7 | OptimizeResponseMapper
8 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
9 |
10 | @unittest.skip('unknown vllm connection error')
11 | class OptimizeResponseMapperTest(DataJuicerTestCaseBase):
12 |
13 | def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1):
14 |
15 | op = OptimizeResponseMapper(enable_vllm=enable_vllm,
16 | sampling_params=sampling_params)
17 |
18 | samples = [{
19 | 'query':
20 | '鱼香肉丝怎么做?',
21 | 'response':
22 | '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。'
23 | }, {
24 | 'query': '什么是蚂蚁上树?',
25 | 'response': '蚂蚁上树是一道中国菜。'
26 | }]
27 | dataset = Dataset.from_list(samples)
28 | results = dataset.map(op.process, num_proc=num_proc, with_rank=True)
29 |
30 | for row in results:
31 | logger.info(f'Output results: {row}')
32 | self.assertNotEqual(row['response'], '')
33 |
34 | def test(self):
35 | sampling_params = {'max_new_tokens': 200}
36 | self._run_op(sampling_params=sampling_params)
37 |
38 | def test_multi_process(self):
39 | sampling_params = {'max_new_tokens': 200}
40 | self._run_op(sampling_params=sampling_params, num_proc=2)
41 |
42 | def test_vllm(self):
43 | sampling_params = {'max_tokens': 200}
44 | self._run_op(enable_vllm=True, sampling_params=sampling_params)
45 |
46 |
47 | if __name__ == '__main__':
48 | unittest.main()
49 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_punctuation_normalization_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.core.data import NestedDataset as Dataset
4 | from data_juicer.ops.mapper.punctuation_normalization_mapper import \
5 | PunctuationNormalizationMapper
6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
7 |
8 |
9 | class PunctuationNormalizationMapperTest(DataJuicerTestCaseBase):
10 |
11 | def setUp(self):
12 | self.op = PunctuationNormalizationMapper()
13 |
14 | def _run_punctuation_normalization(self, samples):
15 | dataset = Dataset.from_list(samples)
16 | dataset = dataset.map(self.op.process, batch_size=2)
17 |
18 | for data in dataset:
19 | self.assertEqual(data['text'], data['target'])
20 |
21 | def test_case(self):
22 |
23 | samples = [{
24 | 'text':
25 | ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►',
26 | 'target':
27 | ",.,\"\"\"\"\"\"\"\"\"\"'::?!();- - . ~'...-<>[]%-"
28 | }]
29 |
30 | self._run_punctuation_normalization(samples)
31 |
32 |
33 | if __name__ == '__main__':
34 | unittest.main()
35 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_remove_bibliography_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.core.data import NestedDataset as Dataset
4 | from data_juicer.ops.mapper.remove_bibliography_mapper import \
5 | RemoveBibliographyMapper
6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
7 |
8 |
9 | class RemoveBibliographyMapperTest(DataJuicerTestCaseBase):
10 |
11 | def setUp(self):
12 | self.op = RemoveBibliographyMapper()
13 |
14 | def _run_remove_bibliography(self, samples):
15 | dataset = Dataset.from_list(samples)
16 | dataset = dataset.map(self.op.process, batch_size=2)
17 |
18 | for data in dataset:
19 | self.assertEqual(data['text'], data['target'])
20 |
21 | def test_bibliography_case(self):
22 |
23 | samples = [{
24 | 'text':
25 | "%%\n%% This is file `sample-sigconf.tex\\clearpage\n\\bibliographystyle{ACM-Reference-Format}\n\\bibliography{sample-base}\n\\end{document}\n\\endinput\n%%\n%% End of file `sample-sigconf.tex'.\n", # noqa: E501
26 | 'target':
27 | '%%\n%% This is file `sample-sigconf.tex\\clearpage\n\\bibliographystyle{ACM-Reference-Format}\n' # noqa: E501
28 | }]
29 |
30 | self._run_remove_bibliography(samples)
31 |
32 | def test_ref_case(self):
33 |
34 | samples = [{
35 | 'text':
36 | "%%\n%% This is file `sample-sigconf.tex\\clearpage\n\\begin{references}\n\\end{document}\n\\endinput\n%%\n%% End of file `sample-sigconf.tex'.\n", # noqa: E501
37 | 'target':
38 | '%%\n%% This is file `sample-sigconf.tex\\clearpage\n' # noqa: E501
39 | }]
40 |
41 | self._run_remove_bibliography(samples)
42 |
43 |
44 | if __name__ == '__main__':
45 | unittest.main()
46 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_remove_specific_chars_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.core.data import NestedDataset as Dataset
4 | from data_juicer.ops.mapper.remove_specific_chars_mapper import \
5 | RemoveSpecificCharsMapper
6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
7 |
8 |
9 | class RemoveSpecificCharsMapperTest(DataJuicerTestCaseBase):
10 |
11 | def setUp(self):
12 | self.op = RemoveSpecificCharsMapper()
13 |
14 | def _run_helper(self, samples):
15 | dataset = Dataset.from_list(samples)
16 | dataset = dataset.map(self.op.process, batch_size=2)
17 |
18 | for data in dataset:
19 | self.assertEqual(data['text'], data['target'])
20 |
21 | def test_complete_html_text(self):
22 |
23 | samples = [
24 | {
25 | 'text': '这是一个干净的文本。Including Chinese and English.',
26 | 'target': '这是一个干净的文本。Including Chinese and English.',
27 | },
28 | {
29 | 'text': '◆●■►▼▲▴∆▻▷❖♡□',
30 | 'target': '',
31 | },
32 | {
33 | 'text': '►This is a dirty text ▻ 包括中文和英文',
34 | 'target': 'This is a dirty text 包括中文和英文',
35 | },
36 | {
37 | 'text': '多个●■►▼这样的特殊字符可以►▼▲▴∆吗?',
38 | 'target': '多个这样的特殊字符可以吗?',
39 | },
40 | {
41 | 'text': '未指定的●■☛₨➩►▼▲特殊字符会☻▷❖被删掉吗??',
42 | 'target': '未指定的☛₨➩特殊字符会☻被删掉吗??',
43 | },
44 | ]
45 | self._run_helper(samples)
46 |
47 |
48 | if __name__ == '__main__':
49 | unittest.main()
50 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_sentence_augmentation_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from copy import deepcopy
3 | from data_juicer.ops.mapper.sentence_augmentation_mapper import SentenceAugmentationMapper
4 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
5 |
6 |
7 | class SentenceAugmentationMapperTest(DataJuicerTestCaseBase):
8 |
9 | hf_model = 'Qwen/Qwen2-7B-Instruct'
10 |
11 | text_key = "caption1"
12 | text_key_second = "caption2"
13 |
14 | @classmethod
15 | def tearDownClass(cls) -> None:
16 | super().tearDownClass(cls.hf_model)
17 |
18 | def _run_sentence_augmentation_mapper(self):
19 | op = SentenceAugmentationMapper(
20 | hf_model=self.hf_model,
21 | task_sentence="Please replace one entity in this sentence with "
22 | "another entity, such as an animal, a vehicle, or a "
23 | "piece of furniture. Please only answer with the "
24 | "replaced sentence.",
25 | max_new_tokens=512,
26 | temperature=0.9,
27 | top_p=0.95,
28 | num_beams=1,
29 | text_key=self.text_key,
30 | text_key_second=self.text_key_second
31 | )
32 |
33 | samples = [
34 | {self.text_key: 'a book is near a cat and a dog'}
35 | ]
36 |
37 | for sample in samples:
38 | result = op.process(deepcopy(sample))
39 | print(f'Output results: {result}')
40 | self.assertNotEqual(sample, result)
41 |
42 | def test_sentence_augmentation_mapper(self):
43 | self._run_sentence_augmentation_mapper()
44 |
45 |
46 | if __name__ == '__main__':
47 | unittest.main()
48 |
--------------------------------------------------------------------------------
/tests/ops/mapper/test_whitespace_normalization_mapper.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.core.data import NestedDataset as Dataset
4 | from data_juicer.ops.mapper.whitespace_normalization_mapper import \
5 | WhitespaceNormalizationMapper
6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
7 |
8 |
9 | class WhitespaceNormalizationMapperTest(DataJuicerTestCaseBase):
10 |
11 | def setUp(self):
12 | self.op = WhitespaceNormalizationMapper()
13 |
14 | def _run_whitespace_normalization(self, samples):
15 | dataset = Dataset.from_list(samples)
16 | dataset = dataset.map(self.op.process, batch_size=2)
17 |
18 | for data in dataset:
19 | self.assertEqual(data['text'], data['target'])
20 |
21 | def test_case(self):
22 |
23 | samples = [{
24 | 'text': 'x \t \u200B\u200C\u200D\u2060\u0084y',
25 | 'target': 'x y'
26 | }]
27 |
28 | self._run_whitespace_normalization(samples)
29 |
30 |
31 | if __name__ == '__main__':
32 | unittest.main()
33 |
--------------------------------------------------------------------------------
/tests/ops/selector/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/selector/__init__.py
--------------------------------------------------------------------------------
/tests/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/tools/__init__.py
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/utils/__init__.py
--------------------------------------------------------------------------------
/tests/utils/test_availablility_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.utils.availability_utils import _is_package_available
4 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
5 |
6 | class AvailabilityUtilsTest(DataJuicerTestCaseBase):
7 |
8 | def test_is_package_available(self):
9 | exist = _is_package_available('fsspec')
10 | self.assertTrue(exist)
11 | exist, version = _is_package_available('fsspec', return_version=True)
12 | self.assertTrue(exist)
13 | self.assertEqual(version, '2023.5.0')
14 |
15 | exist = _is_package_available('non_existing_package')
16 | self.assertFalse(exist)
17 | exist, version = _is_package_available('non_existing_package', return_version=True)
18 | self.assertFalse(exist)
19 | self.assertEqual(version, 'N/A')
20 |
21 |
22 | if __name__ == '__main__':
23 | unittest.main()
24 |
--------------------------------------------------------------------------------
/tests/utils/test_cache_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import datasets
4 |
5 | from data_juicer.utils.cache_utils import DatasetCacheControl, dataset_cache_control
6 |
7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
8 |
9 | class DatasetCacheControlTest(DataJuicerTestCaseBase):
10 |
11 | def test_basic_func(self):
12 | self.assertTrue(datasets.is_caching_enabled())
13 | with DatasetCacheControl(on=False):
14 | self.assertFalse(datasets.is_caching_enabled())
15 | self.assertTrue(datasets.is_caching_enabled())
16 |
17 | with DatasetCacheControl(on=False):
18 | self.assertFalse(datasets.is_caching_enabled())
19 | with DatasetCacheControl(on=True):
20 | self.assertTrue(datasets.is_caching_enabled())
21 | self.assertFalse(datasets.is_caching_enabled())
22 | self.assertTrue(datasets.is_caching_enabled())
23 |
24 | def test_decorator(self):
25 |
26 | @dataset_cache_control(on=False)
27 | def check():
28 | return datasets.is_caching_enabled()
29 |
30 | self.assertTrue(datasets.is_caching_enabled())
31 | self.assertFalse(check())
32 | self.assertTrue(datasets.is_caching_enabled())
33 |
34 |
35 | if __name__ == '__main__':
36 | unittest.main()
37 |
--------------------------------------------------------------------------------
/tests/utils/test_fingerprint_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.core import NestedDataset
4 | from data_juicer.utils.fingerprint_utils import generate_fingerprint
5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
6 |
7 | class FingerprintUtilsTest(DataJuicerTestCaseBase):
8 |
9 | def test_generate_fingerprint(self):
10 | dataset = NestedDataset.from_list([{'text_key': 'test_val'}])
11 | fingerprint = generate_fingerprint(dataset)
12 | self.assertLessEqual(len(fingerprint), 64)
13 |
14 | # with func args
15 | new_fingerprint = generate_fingerprint(dataset, lambda x: x['text_key'])
16 | self.assertLessEqual(len(new_fingerprint), 64)
17 | self.assertNotEqual(new_fingerprint, fingerprint)
18 |
19 |
20 | if __name__ == '__main__':
21 | unittest.main()
22 |
--------------------------------------------------------------------------------
/tests/utils/test_process_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import torch
3 | import multiprocess as mp
4 |
5 | from data_juicer.utils.process_utils import setup_mp, get_min_cuda_memory, calculate_np
6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
7 |
8 | class ProcessUtilsTest(DataJuicerTestCaseBase):
9 |
10 | def test_setup_mp(self):
11 | all_methods = mp.get_all_start_methods()
12 | setup_mp()
13 | self.assertIn(mp.get_start_method(), all_methods)
14 |
15 | setup_mp('spawn')
16 | self.assertEqual(mp.get_start_method(), 'spawn')
17 |
18 | setup_mp(['spawn', 'forkserver', 'fork'])
19 | self.assertEqual(mp.get_start_method(), 'spawn')
20 |
21 | def test_get_min_cuda_memory(self):
22 | if torch.cuda.is_available():
23 | self.assertIsInstance(get_min_cuda_memory(), int)
24 | else:
25 | with self.assertRaises(AssertionError):
26 | get_min_cuda_memory()
27 |
28 |
29 | if __name__ == '__main__':
30 | unittest.main()
31 |
--------------------------------------------------------------------------------
/tests/utils/test_registry.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.utils.registry import Registry
4 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
5 |
6 | class RegistryTest(DataJuicerTestCaseBase):
7 |
8 | def test_basic_func(self):
9 | registry = Registry('test')
10 |
11 | class A:
12 | pass
13 | registry.register_module('module_a', A)
14 |
15 | @registry.register_module('module_b')
16 | class B:
17 | pass
18 |
19 | self.assertEqual(registry.name, 'test')
20 | self.assertEqual(registry.modules, {'module_a': A, 'module_b': B})
21 | self.assertEqual(registry.list(), ['module_a', 'module_b'])
22 | self.assertEqual(registry.get('module_a'), A)
23 | self.assertEqual(registry.get('module_b'), B)
24 |
25 | with self.assertRaises(KeyError):
26 | registry.register_module('module_b', B)
27 |
28 | with self.assertRaises(TypeError):
29 | registry.register_module(1, A)
30 |
31 |
32 | if __name__ == '__main__':
33 | unittest.main()
34 |
--------------------------------------------------------------------------------
/tests/utils/test_resource_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from data_juicer.utils.lazy_loader import LazyLoader
3 | from data_juicer.utils.resource_utils import query_cuda_info, query_mem_info, get_cpu_count, get_cpu_utilization
4 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
5 |
6 |
7 | class ResourceUtilsTest(DataJuicerTestCaseBase):
8 |
9 | def test_query_cuda_info(self):
10 | torch = LazyLoader('torch')
11 | if torch.cuda.is_available():
12 | self.assertIsNotNone(query_cuda_info('memory.used'))
13 | else:
14 | self.assertIsNone(query_cuda_info('memory.used'))
15 |
16 | def test_query_mem_info(self):
17 | self.assertIsInstance(query_mem_info('total'), float)
18 | self.assertIsNone(query_mem_info('invalid key'))
19 |
20 | def test_get_cpu_count(self):
21 | self.assertIsInstance(get_cpu_count(), int)
22 |
23 | def test_get_cpu_utilization(self):
24 | self.assertIsInstance(get_cpu_utilization(), float)
25 |
26 |
27 | if __name__ == '__main__':
28 | unittest.main()
29 |
--------------------------------------------------------------------------------
/tests/utils/test_unittest_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
4 |
5 | class UnittestUtilsTest(DataJuicerTestCaseBase):
6 |
7 | def test_placeholder(self):
8 | # placeholder for test
9 | pass
10 |
11 |
12 | if __name__ == '__main__':
13 | unittest.main()
14 |
--------------------------------------------------------------------------------
/thirdparty/LLM_ecosystems/README.md:
--------------------------------------------------------------------------------
1 | # LLM Ecosystems
2 |
3 | Dependencies of the Auto Evaluation Toolkit; see [`tools/evaluator/README.md`](../../tools/evaluator/README.md) for more details.
4 |
5 | ## Installation
6 |
7 | The auto-evaluation toolkit requires customized [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) and [HELM](https://github.com/stanford-crfm/helm).
8 | To avoid dependency problems when installing those packages, we recommend using NGC's PyTorch container (`nvcr.io/nvidia/pytorch:22.12-py3`).
9 | Assuming the path to your shared file system (where your data and model checkpoints are saved) is `/mnt/shared`, start the Docker container with the following commands.
10 |
11 | ```shell
12 | docker pull nvcr.io/nvidia/pytorch:22.12-py3
13 | docker run --gpus all -it --rm -v /mnt/shared:/workspace nvcr.io/nvidia/pytorch:22.12-py3
14 | ```
15 |
16 | After starting the docker container, run the following scripts in the container to install Megatron-LM or HELM.
17 |
18 | The training machines only need to install Megatron-LM:
19 |
20 | ```shell
21 | ./setup_megatron.sh
22 | ```
23 |
24 | The evaluation machine needs to install both Megatron-LM and HELM:
25 |
26 | ```shell
27 | ./setup_megatron.sh
28 | ./setup_helm.sh
29 | ```
30 |
31 | The toolkit uses [W&B](https://wandb.ai/) (wandb) to monitor metric trends during training. The steps above have already installed wandb, so you only need to run `wandb login` and enter your wandb API key. If you host your own wandb instance, log in with the following commands.
32 |
33 | ```shell
34 | wandb login --host <your-wandb-host-url>
35 | # enter your api key
36 | ```
37 |
38 |
--------------------------------------------------------------------------------
/thirdparty/LLM_ecosystems/README_ZH.md:
--------------------------------------------------------------------------------
1 | # 大语言模型生态
2 |
3 | 本目录包含了 Auto Evaluation Toolkit 的第三方依赖项,更多细节请参考 `tools/evaluator/README_ZH.md`。
4 |
5 | ## 安装
6 |
7 | Auto Evaluation Toolkit 依赖于定制化的 [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) 和 [HELM](https://github.com/stanford-crfm/helm)。
8 | 为了避免安装这些软件包时可能出现的依赖项问题,我们建议使用 NGC 的 PyTorch 容器(`nvcr.io/nvidia/pytorch:22.12-py3`)。
9 | 假设您共享文件系统的路径(即数据集和模型检查点的存储路径)为`/mnt/shared`,请使用如下指令启动 Docker 容器。
10 |
11 | ```shell
12 | docker pull nvcr.io/nvidia/pytorch:22.12-py3
13 | docker run --gpus all -it --rm -v /mnt/shared:/workspace nvcr.io/nvidia/pytorch:22.12-py3
14 | ```
15 |
16 | 启动 Docker 容器后,在容器中运行以下脚本以安装 Megatron-LM 或 HELM。
17 |
18 | 训练机只需要安装 Megatron-LM:
19 |
20 | ```shell
21 | ./setup_megatron.sh
22 | ```
23 |
24 | 评测机需要同时安装 Megatron-LM 和 HELM:
25 |
26 | ```shell
27 | ./setup_megatron.sh
28 | ./setup_helm.sh
29 | ```
30 |
31 | 工具包使用 [WandB](https://wandb.ai/) 来监视训练期间各指标的趋势。上面的步骤中已安装 wandb,您只需要运行 `wandb login` 并输入 wandb API 密钥即可。如果您有自己的 wandb 实例,请运行以下指令。
32 |
33 | ```shell
34 | wandb login --host <your-wandb-host-url>
35 | #输入您的 API 密钥
36 | ```
37 |
--------------------------------------------------------------------------------
/thirdparty/LLM_ecosystems/setup_helm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export THIRD_PARTY_DIR=$(cd $(dirname $0); pwd)
4 | export HELM_DIR=${THIRD_PARTY_DIR}/helm
5 |
6 | # install conda
7 | command -v conda &> /dev/null
8 | if [ $? -ne 0 ]; then
9 | echo "> setup conda ..."
10 | CONDA_DIR=${HOME}/miniconda3
11 | wget https://repo.anaconda.com/miniconda/Miniconda3-py38_23.1.0-1-Linux-x86_64.sh
12 | bash Miniconda3-py38_23.1.0-1-Linux-x86_64.sh -b -p $CONDA_DIR
13 | export PATH=$CONDA_DIR/bin:$PATH
14 | fi
15 |
16 | # setup helm
17 | echo "> setup helm ..."
18 | git clone https://github.com/stanford-crfm/helm.git ${HELM_DIR}
19 | cd $HELM_DIR
20 | git reset 33ca6e62 --hard
21 | git apply ${THIRD_PARTY_DIR}/patch/helm.diff
22 | conda create -n crfm-helm python=3.8 pip -y
23 | eval "$(conda shell.bash hook)"
24 | conda activate crfm-helm
25 | pip install -e .
--------------------------------------------------------------------------------
/thirdparty/LLM_ecosystems/setup_megatron.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export THIRD_PARTY_DIR=$(cd $(dirname $0); pwd)
4 | export MEGATRON_DIR=${THIRD_PARTY_DIR}/Megatron-LM
5 |
6 |
7 | # setup megatron
8 | echo "> setup Megatron-LM ..."
9 | git clone https://github.com/NVIDIA/Megatron-LM.git ${MEGATRON_DIR}
10 | cd $MEGATRON_DIR
11 | git reset 040eac9 --hard
12 | git apply ${THIRD_PARTY_DIR}/patch/megatron.diff
13 | pip install flash-attn flask flask_restful jsonlines asyncio wandb sentencepiece
14 |
--------------------------------------------------------------------------------
/thirdparty/models/README.md:
--------------------------------------------------------------------------------
1 | # Third-party Model Library
2 |
3 | ## EasyAnimate
4 |
5 | Install [EasyAnimate](https://github.com/aigc-apps/EasyAnimate):
6 |
7 | ```shell
8 | bash ./setup_easyanimate.sh
9 | ```
10 |
--------------------------------------------------------------------------------
/thirdparty/models/README_ZH.md:
--------------------------------------------------------------------------------
1 | # 第三方模型库
2 |
3 | ## EasyAnimate
4 |
5 | 安装[EasyAnimate](https://github.com/aigc-apps/EasyAnimate):
6 |
7 | ```shell
8 | bash ./setup_easyanimate.sh
9 | ```
10 |
--------------------------------------------------------------------------------
/thirdparty/models/setup_easyanimate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export THIRD_PARTY_DIR=$(cd $(dirname $0); pwd)
4 | export EASYANIMATE_DIR=${THIRD_PARTY_DIR}/EasyAnimate
5 |
6 | # setup easyanimate
7 | echo "> setup easyanimate ..."
8 | git clone https://github.com/aigc-apps/EasyAnimate.git ${EASYANIMATE_DIR}
9 | cd $EASYANIMATE_DIR
10 | git reset b54412ceb0af6a06bf907e049920f18508c862f1 --hard
11 | git apply ${THIRD_PARTY_DIR}/patch/easyanimate.diff
12 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/__init__.py
--------------------------------------------------------------------------------
/tools/analyze_data.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 |
3 | from data_juicer.core import Analyzer
4 |
5 |
6 | @logger.catch(reraise=True)
7 | def main():
8 | analyzer = Analyzer()
9 | analyzer.run()
10 |
11 |
12 | if __name__ == '__main__':
13 | main()
14 |
--------------------------------------------------------------------------------
/tools/converter/batch_convert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | WORKDIR='.'
6 |
7 | MODEL_TO_CONVERT=(
8 | )
9 |
10 | PATH_TO_SAVE=(
11 | )
12 |
13 | for i in "${!MODEL_TO_CONVERT[@]}"; do
14 | path_model=${MODEL_TO_CONVERT[i]}
15 | path_save=${PATH_TO_SAVE[i]}
16 |
17 | echo $i ":" $path_model "to" $path_save
18 |
19 | python ${WORKDIR}/convert/convert_gpt_to_transformers.py \
20 | --load_path ${path_model} \
21 | --save_path ${path_save} \
22 | --max_shard_size "10GB" \
23 | --tokenizer_name "decapoda-research/llama-7b-hf" \
24 | --print-checkpoint-structure
25 | done
26 |
--------------------------------------------------------------------------------
/tools/distributed_deduplication/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/distributed_deduplication/__init__.py
--------------------------------------------------------------------------------
/tools/evaluator/config/evaluator_example.yaml:
--------------------------------------------------------------------------------
1 | auto_eval:
2 | project_name:
3 | model_name:
4 | cache_dir:
5 | megatron:
6 | process_num:
7 | megatron_home:
8 | checkpoint_path:
9 | tokenizer_type:
10 | vocab_path:
11 | merge_path:
12 | max_tokens:
13 | token_per_iteration:
14 | # tokenizer_path:
15 | # log_path:
16 | helm:
17 | helm_spec_template_path:
18 | helm_output_path:
19 | helm_env_name:
20 | gpt_evaluation:
21 | # openai config
22 | openai_api_key:
23 | openai_organization:
24 | # files config
25 | question_file: ./tools/evaluator/gpt_eval/config/question.jsonl
26 | answer_file:
27 | baseline_file: ./tools/evaluator/gpt_eval/answer/openai/chatgpt.jsonl
28 | prompt_file: ./tools/evaluator/gpt_eval/config/prompt.jsonl
29 | reviewer_file: ./tools/evaluator/gpt_eval/config/reviewer.jsonl
30 | result_file:
31 | wandb:
32 | project:
33 | base_url:
34 |
--------------------------------------------------------------------------------
/tools/evaluator/gpt_eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/evaluator/gpt_eval/__init__.py
--------------------------------------------------------------------------------
/tools/evaluator/gpt_eval/config/config.yaml:
--------------------------------------------------------------------------------
1 | answer_generation:
2 | model_name: my_model
3 | question_file: ./config/question.jsonl
4 | answer_file: ./answer/myorg/mymodel.jsonl
5 | batch_size: 4
6 | max_tokens: 512
7 | temperature: 0.7
8 | # config for huggingface
9 | huggingface:
10 | model_path: myorg/mymodel
11 | tokenizer_path: myorg/mymodel
12 | # # config for openai
13 | # openai:
14 | # openai_organization:
15 | # openai_api_key:
16 | # model:
17 | # max_retry:
18 | # # config for megatron-lm
19 | # megatron:
20 | # process_num:
21 | # checkpoint_path:
22 | # tokenizer_type:
23 | # vocab_path:
24 | # merge_path:
25 | # iteration:
26 | gpt_evaluation:
27 | # openai config
28 | openai_organization:
29 | openai_api_key:
30 | # files config
31 | question_file: ./config/question.jsonl
32 | answer_file: ./answer/myorg/mymodel.jsonl
33 | baseline_file: ./answer/openai/gpt-3.5-turbo.jsonl
34 | prompt_file: ./config/prompt.jsonl
35 | reviewer_file: ./config/reviewer.jsonl
36 | result_file: ./review/myorg/mymodel-gpt3.5-turbo.jsonl
37 |
--------------------------------------------------------------------------------
/tools/evaluator/gpt_eval/config/reviewer.jsonl:
--------------------------------------------------------------------------------
1 | {"category": "general", "metadata": {"temperature": 0.2, "max_tokens": 1024, "model": "gpt-3.5-turbo"}}
2 | {"category": "coding", "metadata": {"temperature": 0.2, "max_tokens": 1024, "model": "gpt-3.5-turbo"}}
3 | {"category": "math", "metadata": {"temperature": 0.2, "max_tokens": 1024, "model": "gpt-3.5-turbo"}}
4 |
--------------------------------------------------------------------------------
/tools/evaluator/recorder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/evaluator/recorder/__init__.py
--------------------------------------------------------------------------------
/tools/evaluator/recorder/config/leaderboard_example.yaml:
--------------------------------------------------------------------------------
1 | project:
2 | base_url:
3 | leaderboard: True
4 | leaderboard_metrics:
5 | - mmlu.EM
6 | - boolq.EM
7 | - quac.F1
8 | - raft.EM
9 | - hellaswag.EM
10 | - ...
11 | excluded_runs:
12 | -
13 | - ...
14 |
--------------------------------------------------------------------------------
/tools/evaluator/recorder/config/llama_example.yaml:
--------------------------------------------------------------------------------
1 | project:
2 | base_url:
3 | evals:
4 | - eval_type: helm
5 | model_name: llama-7b
6 | source: file
7 | token_num: 1000
8 | eval_result:
9 | mmlu:
10 | EM: 0.345
11 | raft:
12 | EM: 0.583
13 | imdb:
14 | EM: 0.933
15 | truthful_qa:
16 | EM: 0.297
17 | summarization_cnndm:
18 | ROUGE-2: 0.149
19 | summarization_xsum:
20 | ROUGE-2: 0.127
21 | boolq:
22 | EM: 0.751
23 | msmarco_trec:
24 | NDCG@10: 0.482
25 | msmarco_regular:
26 | RR@10: 0.252
27 | narrative_qa:
28 | F1: 0.524
29 | natural_qa_closedbook:
30 | F1: 0.297
31 | natural_qa_openbook_longans:
32 | F1: 0.580
33 | quac:
34 | F1: 0.332
35 | civil_comments:
36 | EM: 0.578
37 | hellaswag:
38 | EM: 0.747
39 | openbookqa:
40 | EM: 0.574
41 |
--------------------------------------------------------------------------------
/tools/evaluator/recorder/config/mymodel_example.yaml:
--------------------------------------------------------------------------------
1 | project:
2 | base_url:
3 | evals:
4 | - eval_type: helm
5 | model_name:
6 | source: helm
7 | helm_output_dir:
8 | helm_suite_name:
9 | token_per_iteration:
10 | benchmarks:
11 | - name: mmlu
12 | metrics:
13 | - EM
14 | - name: boolq
15 | metrics:
16 | - EM
17 | - name: quac
18 | metrics:
19 | - F1
20 | - name: raft
21 | metrics:
22 | - EM
23 | - name: hellaswag
24 | metrics:
25 | - EM
26 | - ...
27 |
--------------------------------------------------------------------------------
/tools/fmt_conversion/README_ZH.md:
--------------------------------------------------------------------------------
1 | # 格式转换工具
2 |
3 | 在这里,Data-Juicer 为各式各样的数据集提供了十数种格式转换工具,包括多模态数据集,后微调数据集等等。
4 | 这些工具能够将原始格式的数据集转换为Data-Juicer所需的统一中间格式(我们称之为"DJ格式")。DJ算子的默认实现基于该格式进行设计,例如会直接从'text'字段读取数据载荷(payload)执行相应处理操作。对于特殊格式需求,用户既可以通过调整算子参数配置来适配,也可通过自定义算子实现进行扩展支持。
5 |
6 | DJ 格式的一个示例如下所示:
7 |
8 | ```python
9 | {
10 | // >>> 核心内容:文本,对话,......
11 | "text": "xxx",
12 | "query": "xxx",
13 | "response": "xxx",
14 | ......
15 | // <<< 核心内容
16 |
17 | // >>> 额外数据内容:多模态数据路径,......
18 | "images": [
19 | "path/to/the/image/of/antarctica_snowfield",
20 | "path/to/the/image/of/antarctica_map",
21 | "path/to/the/image/of/europe_map"
22 | ],
23 | "audios": [
24 | "path/to/the/audio/of/sound_of_waves_in_Antarctic_Ocean"
25 | ],
26 | "videos": [
27 | "path/to/the/video/of/remote_sensing_view_of_antarctica"
28 | ],
29 | // <<< 额外数据内容
30 |
31 | // >>> meta 信息和 stats,它们可能是数据集原生的,也可以由 Data-Juicer 产出
32 | "meta": {
33 | "src": "customized",
34 | "version": "0.1",
35 | "author": "xxx"
36 | },
37 | "stats": {
38 | "lang": "en",
39 | "image_widths": [224, 336, 512],
40 | ...
41 | },
42 | // <<< meta 信息和 stats
43 | }
44 | ```
45 |
46 | 在 DJ 格式中大概包括三个部分:
47 | 1. 核心内容:例如 LLM 的预训练数据集中的文本内容,后微调数据集中的对话内容等。它们与数据集的下游使用的训练或者微调过程直接相关。
48 | 2. 额外数据内容:例如多模态数据集中的多模态数据路径。它们被组织为路径列表。
49 | 3. Meta 信息和 Stats:例如从原始数据集中继承而来的数据集版本或来源信息,或者由 Data-Juicer 的算子产出的类别 tags 和 stats 信息。
50 |
51 | 其中,第 2 和第 3 部分对于不同的数据集来说是通用的,而且都会被组织为几乎相同的结构。
52 | 作为对比,第 1 部分,也就是核心内容部分,对于各种数据集来说可能非常不同。
53 | 这里列举了针对不同种类数据集介绍这个部分更多细节的对应的文档:
54 | - [多模态数据集](multimodal/README_ZH.md)
55 | - [后微调数据集](post_tuning_dialog/README_ZH.md)
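55 | 
56 | 例如,下面给出一个仅作示意的最小草图(假设性示例:变量名为示意用途,字段沿用上文示例中已出现的 "text"、"query"、"response" 与 "meta",具体字段请以上述子文档为准),用于说明不同类型数据集只在核心内容字段上有所区别,而 meta 信息等部分的结构保持一致:
57 | 
58 | ```python
59 | # 预训练文本数据集样本:核心内容仅为 "text" 字段
60 | pretrain_sample = {
61 | "text": "xxx",
62 | "meta": {"src": "customized", "version": "0.1"},
63 | }
64 | 
65 | # 后微调对话数据集样本:核心内容换为 "query" / "response" 字段,其余结构不变
66 | post_tuning_sample = {
67 | "query": "xxx",
68 | "response": "xxx",
69 | "meta": {"src": "customized", "version": "0.1"},
70 | }
71 | ```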
--------------------------------------------------------------------------------
/tools/hpo/configs/process.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for dataset
2 |
3 | # global parameters
4 | project_name: 'demo-process-hpo'
5 | dataset_path: 'demo-redpajama-c4-refined.jsonl'
6 | np: 4 # number of subprocess to process your dataset
7 |
8 | export_path: './outputs/demo-hpo-process/demo-hpo-processed.jsonl'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 | - character_repetition_filter: # filter text with the character repetition ratio out of specific range
14 | rep_len: 10 # repetition length for char-level n-gram
15 | min_ratio: 0.0 # the min ratio of filter range
16 | max_ratio: 0.5
17 | - text_length_filter: # filter text with length out of specific range
18 | min_len: 10 # the min length of filter range
19 | max_len: 10000
20 |
--------------------------------------------------------------------------------
/tools/hpo/configs/quality_score_hpo.yaml:
--------------------------------------------------------------------------------
1 |
2 | sweep_name: hpo_for_data-juicer
3 | sweep_max_count: 1000 # the maximal number of trials; `None` for unlimited
4 |
5 | # hpo configuration from original sweep, see more options and details in
6 | # https://docs.wandb.ai/guides/sweeps/define-sweep-configuration
7 |
8 | method: bayes # ["random", "grid", "bayes"]
9 |
10 | metric:
11 | name: quality_score # defined in hpo/objects.py
12 | goal: maximize # ["maximize", "minimize", +"target"]
13 |
14 | parameters:
15 | # can be [single value, multiple values, probabilities, distribution, nested]
16 | character_repetition_filter.rep_len:
17 | values: [2, 4, 8, 16]
18 | character_repetition_filter.max_ratio:
19 | values: [0.3, 0.5, 0.7]
20 | text_length_filter.min_len:
21 | distribution: q_log_uniform_values
22 | min: 256
23 | max: 8192
24 |
25 | early_terminate:
26 | type: hyperband
27 | max_iter: 27
28 | s: 2
29 |
--------------------------------------------------------------------------------
/tools/hpo/execute_hpo_wandb.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import wandb
4 | import yaml
5 | from jsonargparse import namespace_to_dict
6 | from objects import get_hpo_objective
7 |
8 | from data_juicer.config import init_configs, merge_config
9 |
10 | # 1: load the defined search space
11 | sweep_cfg_file_path = None
12 | for i in range(len(sys.argv) - 1):
13 | if sys.argv[i] == '--hpo_config':
14 | sweep_cfg_file_path = sys.argv[i + 1]
15 | break
16 | if not sweep_cfg_file_path:
17 | raise ValueError('`--hpo_config` not found. Please specify your '
18 | 'HPO config file path after `--hpo_config`.')
19 | with open(sweep_cfg_file_path) as f:
20 | sweep_configuration = yaml.safe_load(f)
21 |
22 |
23 | def search():
24 | wandb.init(project=sweep_configuration['sweep_name'])
25 |
26 | # 2.1: Choose objective that links the hyper-parameters you want to search
27 | object_func = get_hpo_objective(sweep_configuration['metric']['name'])
28 |
29 | dj_cfg = init_configs()
30 | # merge the new hyper-parameters selected by HPO scheduler
31 | dj_cfg = merge_config(dj_cfg, wandb.config)
32 | wandb.config = namespace_to_dict(dj_cfg) # for configuration track
33 |
34 | # 2.2: calculate objective using new hyper-parameters, track the results
35 | score = float(object_func(dj_cfg))
36 | wandb.log({sweep_configuration['metric']['name']: score})
37 |
38 |
39 | # 3: Start the sweep, iteratively search hyper-parameters
40 | sweep_id = wandb.sweep(sweep=sweep_configuration,
41 | project=sweep_configuration['sweep_name'])
42 |
43 | wandb.agent(sweep_id,
44 | function=search,
45 | count=sweep_configuration['sweep_max_count']
46 | if 'sweep_max_count' in sweep_configuration else None)
47 |
--------------------------------------------------------------------------------
/tools/humanops/enable_legacy_token.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/humanops/enable_legacy_token.png
--------------------------------------------------------------------------------
/tools/mm_eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/mm_eval/__init__.py
--------------------------------------------------------------------------------
/tools/mm_eval/vbench_metrics/README.md:
--------------------------------------------------------------------------------
1 | VBench is from the paper "VBench: Comprehensive Benchmark Suite for Video Generative Models".
2 | 
3 | Please refer to [GitHub](https://github.com/Vchitect/VBench) for more details.
4 |
--------------------------------------------------------------------------------
/tools/mm_eval/vbench_metrics/README_ZH.md:
--------------------------------------------------------------------------------
1 | VBench 来自论文 "VBench: Comprehensive Benchmark Suite for Video Generative Models"。
2 |
3 | 请跳转[GitHub](https://github.com/Vchitect/VBench)查看更多信息。
4 |
--------------------------------------------------------------------------------
/tools/multimodal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/multimodal/__init__.py
--------------------------------------------------------------------------------
/tools/process_data.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 |
3 | from data_juicer.config import init_configs
4 | from data_juicer.core import DefaultExecutor
5 |
6 |
7 | @logger.catch(reraise=True)
8 | def main():
9 | cfg = init_configs()
10 | if cfg.executor_type == 'default':
11 | executor = DefaultExecutor(cfg)
12 | elif cfg.executor_type == 'ray':
13 | from data_juicer.core.executor.ray_executor import RayExecutor
14 | executor = RayExecutor(cfg)
15 | else:
16 | raise ValueError(f'Unsupported executor_type: {cfg.executor_type}')
17 | executor.run()
18 | 
19 | 
20 | if __name__ == '__main__':
21 | main()
22 | 
--------------------------------------------------------------------------------
/tools/quality_classifier/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/quality_classifier/__init__.py
--------------------------------------------------------------------------------