├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── custom.md │ ├── feature_request.yml │ └── question.yml └── workflows │ ├── deploy_sphinx_docs.yml │ ├── docker │ └── docker-compose.yml │ ├── perf-bench.yml │ ├── pre-commit.yml │ ├── publish-docker-oss.yml │ ├── publish-docker.yml │ ├── publish-pypi.yml │ ├── sphinx_docs_linkcheck.yml │ ├── stale.yml │ ├── unit-test-partial.yml │ └── unit-test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pre-commit-hooks ├── build_op_doc.py └── tag_mappings.json ├── .secrets.baseline ├── Dockerfile ├── LICENSE ├── README.md ├── README_ZH.md ├── app.py ├── configs ├── annotation │ ├── README.md │ ├── annotation_default.yaml │ └── annotation_with_notifications.yaml ├── config_all.yaml ├── config_min.yaml ├── data_juicer_recipes │ ├── alpaca_cot │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── alpaca-cot-en-refine.yaml │ │ └── alpaca-cot-zh-refine.yaml │ ├── data-juicer-sandbox-optimal.yaml │ ├── data-juicer-sandbox-self-evolution.yaml │ ├── general-video-refine-example.yaml │ ├── github_code │ │ ├── redpajama-code-refine.yaml │ │ ├── redpajama-stack-code-deduplicate.yaml │ │ └── stack-code-refine.yaml │ ├── img-diff-recipe.yaml │ ├── llava-pretrain-refine.yaml │ ├── pile-europarl-refine.yaml │ ├── pile-freelaw-refine.yaml │ ├── pile-hackernews-refine.yaml │ ├── pile-nih-refine.yaml │ ├── pile-philpaper-refine.yaml │ ├── pile-pubmed-abstract-refine.yaml │ ├── pile-pubmed-central-refine.yaml │ ├── pile-uspto-refine.yaml │ ├── redpajama-arxiv-refine.yaml │ ├── redpajama-book-refine.yaml │ ├── redpajama-c4-refine.yaml │ ├── redpajama-cc-2019-30-refine.yaml │ ├── redpajama-cc-2020-05-refine.yaml │ ├── redpajama-cc-2021-04-refine.yaml │ ├── redpajama-cc-2022-05-refine.yaml │ ├── redpajama-cc-2023-06-refine.yaml │ ├── redpajama-pile-stackexchange-refine.yaml │ └── redpajama-wiki-refine.yaml ├── datasets │ ├── local_json.yaml │ ├── local_parquet.yaml │ ├── mixture.yaml │ ├── remote_arxiv.yaml │ ├── remote_commoncrawl.yaml │ ├── remote_huggingface.yaml │ ├── remote_modelscope.yaml │ ├── remote_wiki.yaml │ ├── validation_required_fields.yaml │ └── validation_swift_messages.yaml ├── demo │ ├── analyzer.yaml │ ├── bench │ │ ├── 1_single_op_pipeline.yaml │ │ ├── 2_multi_op_pipeline.yaml │ │ ├── 3_duplicate_pipeline.yaml │ │ ├── model_infer.yaml │ │ ├── model_train.yaml │ │ ├── model_train_2_epoch.yaml │ │ └── vbench_eval.yaml │ ├── dedup.yaml │ ├── process-huggingface.yaml │ ├── process.yaml │ └── sandbox │ │ ├── gpt3_data_quality_eval_config.yaml │ │ ├── gpt3_extra_train_config.json │ │ ├── gpt3_extra_train_config.yaml │ │ ├── inception_eval_config.yaml │ │ └── sandbox.yaml ├── reproduced_bloom │ ├── README.md │ ├── README_ZH.md │ └── bloom-oscar.yaml └── reproduced_redpajama │ ├── README.md │ ├── README_ZH.md │ ├── redpajama-arxiv.yaml │ ├── redpajama-books.yaml │ ├── redpajama-code.yaml │ └── redpajama-stackexchange.yaml ├── data_juicer ├── __init__.py ├── analysis │ ├── __init__.py │ ├── collector.py │ ├── column_wise_analysis.py │ ├── diversity_analysis.py │ ├── draw.py │ ├── measure.py │ └── overall_analysis.py ├── config │ ├── __init__.py │ └── config.py ├── core │ ├── __init__.py │ ├── adapter.py │ ├── analyzer.py │ ├── data │ │ ├── __init__.py │ │ ├── config_validator.py │ │ ├── data_validator.py │ │ ├── dataset_builder.py │ │ ├── dj_dataset.py │ │ ├── load_strategy.py │ │ ├── ray_dataset.py │ │ └── schema.py │ ├── executor │ │ ├── __init__.py │ │ ├── base.py │ │ ├── default_executor.py │ │ ├── factory.py │ │ └── ray_executor.py │ 
├── exporter.py │ ├── monitor.py │ ├── sandbox │ │ ├── evaluators.py │ │ ├── factories.py │ │ ├── hooks.py │ │ ├── model_executors.py │ │ └── pipelines.py │ └── tracer.py ├── download │ ├── __init__.py │ ├── arxiv.py │ ├── commoncrawl.py │ ├── downloader.py │ └── wikipedia.py ├── format │ ├── __init__.py │ ├── csv_formatter.py │ ├── empty_formatter.py │ ├── formatter.py │ ├── json_formatter.py │ ├── load.py │ ├── parquet_formatter.py │ ├── text_formatter.py │ └── tsv_formatter.py ├── ops │ ├── __init__.py │ ├── aggregator │ │ ├── __init__.py │ │ ├── entity_attribute_aggregator.py │ │ ├── meta_tags_aggregator.py │ │ ├── most_relevant_entities_aggregator.py │ │ └── nested_aggregator.py │ ├── base_op.py │ ├── common │ │ ├── __init__.py │ │ ├── helper_func.py │ │ ├── prompt2prompt_pipeline.py │ │ └── special_characters.py │ ├── deduplicator │ │ ├── __init__.py │ │ ├── document_deduplicator.py │ │ ├── document_minhash_deduplicator.py │ │ ├── document_simhash_deduplicator.py │ │ ├── image_deduplicator.py │ │ ├── ray_basic_deduplicator.py │ │ ├── ray_bts_minhash_deduplicator.py │ │ ├── ray_document_deduplicator.py │ │ ├── ray_image_deduplicator.py │ │ ├── ray_video_deduplicator.py │ │ └── video_deduplicator.py │ ├── filter │ │ ├── __init__.py │ │ ├── alphanumeric_filter.py │ │ ├── audio_duration_filter.py │ │ ├── audio_nmf_snr_filter.py │ │ ├── audio_size_filter.py │ │ ├── average_line_length_filter.py │ │ ├── character_repetition_filter.py │ │ ├── flagged_words_filter.py │ │ ├── general_field_filter.py │ │ ├── image_aesthetics_filter.py │ │ ├── image_aspect_ratio_filter.py │ │ ├── image_face_count_filter.py │ │ ├── image_face_ratio_filter.py │ │ ├── image_nsfw_filter.py │ │ ├── image_pair_similarity_filter.py │ │ ├── image_shape_filter.py │ │ ├── image_size_filter.py │ │ ├── image_text_matching_filter.py │ │ ├── image_text_similarity_filter.py │ │ ├── image_watermark_filter.py │ │ ├── language_id_score_filter.py │ │ ├── llm_difficulty_score_filter.py │ │ ├── llm_quality_score_filter.py │ │ ├── maximum_line_length_filter.py │ │ ├── perplexity_filter.py │ │ ├── phrase_grounding_recall_filter.py │ │ ├── special_characters_filter.py │ │ ├── specified_field_filter.py │ │ ├── specified_numeric_field_filter.py │ │ ├── stopwords_filter.py │ │ ├── suffix_filter.py │ │ ├── text_action_filter.py │ │ ├── text_entity_dependency_filter.py │ │ ├── text_length_filter.py │ │ ├── text_pair_similarity_filter.py │ │ ├── token_num_filter.py │ │ ├── video_aesthetics_filter.py │ │ ├── video_aspect_ratio_filter.py │ │ ├── video_duration_filter.py │ │ ├── video_frames_text_similarity_filter.py │ │ ├── video_motion_score_filter.py │ │ ├── video_motion_score_raft_filter.py │ │ ├── video_nsfw_filter.py │ │ ├── video_ocr_area_ratio_filter.py │ │ ├── video_resolution_filter.py │ │ ├── video_tagging_from_frames_filter.py │ │ ├── video_watermark_filter.py │ │ ├── word_repetition_filter.py │ │ └── words_num_filter.py │ ├── grouper │ │ ├── __init__.py │ │ ├── key_value_grouper.py │ │ ├── naive_grouper.py │ │ └── naive_reverse_grouper.py │ ├── load.py │ ├── mapper │ │ ├── __init__.py │ │ ├── annotation │ │ │ ├── __init__.py │ │ │ ├── annotation_mapper.py │ │ │ └── human_preference_annotation_mapper.py │ │ ├── audio_add_gaussian_noise_mapper.py │ │ ├── audio_ffmpeg_wrapped_mapper.py │ │ ├── calibrate_qa_mapper.py │ │ ├── calibrate_query_mapper.py │ │ ├── calibrate_response_mapper.py │ │ ├── chinese_convert_mapper.py │ │ ├── clean_copyright_mapper.py │ │ ├── clean_email_mapper.py │ │ ├── clean_html_mapper.py │ │ ├── 
clean_ip_mapper.py │ │ ├── clean_links_mapper.py │ │ ├── dialog_intent_detection_mapper.py │ │ ├── dialog_sentiment_detection_mapper.py │ │ ├── dialog_sentiment_intensity_mapper.py │ │ ├── dialog_topic_detection_mapper.py │ │ ├── expand_macro_mapper.py │ │ ├── extract_entity_attribute_mapper.py │ │ ├── extract_entity_relation_mapper.py │ │ ├── extract_event_mapper.py │ │ ├── extract_keyword_mapper.py │ │ ├── extract_nickname_mapper.py │ │ ├── extract_support_text_mapper.py │ │ ├── extract_tables_from_html_mapper.py │ │ ├── fix_unicode_mapper.py │ │ ├── generate_qa_from_examples_mapper.py │ │ ├── generate_qa_from_text_mapper.py │ │ ├── image_blur_mapper.py │ │ ├── image_captioning_from_gpt4v_mapper.py │ │ ├── image_captioning_mapper.py │ │ ├── image_diffusion_mapper.py │ │ ├── image_face_blur_mapper.py │ │ ├── image_remove_background_mapper.py │ │ ├── image_segment_mapper.py │ │ ├── image_tagging_mapper.py │ │ ├── imgdiff_difference_area_generator_mapper.py │ │ ├── imgdiff_difference_caption_generator_mapper.py │ │ ├── mllm_mapper.py │ │ ├── nlpaug_en_mapper.py │ │ ├── nlpcda_zh_mapper.py │ │ ├── optimize_qa_mapper.py │ │ ├── optimize_query_mapper.py │ │ ├── optimize_response_mapper.py │ │ ├── pair_preference_mapper.py │ │ ├── punctuation_normalization_mapper.py │ │ ├── python_file_mapper.py │ │ ├── python_lambda_mapper.py │ │ ├── query_intent_detection_mapper.py │ │ ├── query_sentiment_detection_mapper.py │ │ ├── query_topic_detection_mapper.py │ │ ├── relation_identity_mapper.py │ │ ├── remove_bibliography_mapper.py │ │ ├── remove_comments_mapper.py │ │ ├── remove_header_mapper.py │ │ ├── remove_long_words_mapper.py │ │ ├── remove_non_chinese_character_mapper.py │ │ ├── remove_repeat_sentences_mapper.py │ │ ├── remove_specific_chars_mapper.py │ │ ├── remove_table_text_mapper.py │ │ ├── remove_words_with_incorrect_substrings_mapper.py │ │ ├── replace_content_mapper.py │ │ ├── sdxl_prompt2prompt_mapper.py │ │ ├── sentence_augmentation_mapper.py │ │ ├── sentence_split_mapper.py │ │ ├── text_chunk_mapper.py │ │ ├── video_captioning_from_audio_mapper.py │ │ ├── video_captioning_from_frames_mapper.py │ │ ├── video_captioning_from_summarizer_mapper.py │ │ ├── video_captioning_from_video_mapper.py │ │ ├── video_extract_frames_mapper.py │ │ ├── video_face_blur_mapper.py │ │ ├── video_ffmpeg_wrapped_mapper.py │ │ ├── video_remove_watermark_mapper.py │ │ ├── video_resize_aspect_ratio_mapper.py │ │ ├── video_resize_resolution_mapper.py │ │ ├── video_split_by_duration_mapper.py │ │ ├── video_split_by_key_frame_mapper.py │ │ ├── video_split_by_scene_mapper.py │ │ ├── video_tagging_from_audio_mapper.py │ │ ├── video_tagging_from_frames_mapper.py │ │ └── whitespace_normalization_mapper.py │ ├── mixins.py │ ├── op_fusion.py │ └── selector │ │ ├── __init__.py │ │ ├── frequency_specified_field_selector.py │ │ ├── random_selector.py │ │ ├── range_specified_field_selector.py │ │ ├── tags_specified_field_selector.py │ │ └── topk_specified_field_selector.py ├── tools │ └── __init__.py └── utils │ ├── __init__.py │ ├── asset_utils.py │ ├── availability_utils.py │ ├── cache_utils.py │ ├── ckpt_utils.py │ ├── common_utils.py │ ├── compress.py │ ├── constant.py │ ├── file_utils.py │ ├── fingerprint_utils.py │ ├── lazy_loader.py │ ├── logger_utils.py │ ├── mm_utils.py │ ├── model_utils.py │ ├── nltk_utils.py │ ├── process_utils.py │ ├── registry.py │ ├── resource_utils.py │ ├── sample.py │ └── unittest_utils.py ├── demos ├── README.md ├── README_ZH.md ├── api_service │ ├── configs │ │ ├── dj_config_template.yaml │ │ 
└── model_configs.json │ ├── react_data_filter_process.ipynb │ ├── react_data_mapper_process.ipynb │ ├── utils.py │ ├── wrapped_filters.py │ └── wrapped_mappers.py ├── auto_evaluation_helm │ ├── README_ZH.md │ ├── app.py │ └── imgs │ │ ├── data-juicer.png │ │ ├── eval-01.png │ │ └── eval-02.png ├── data │ ├── demo-dataset-annotation-human-preference.jsonl │ ├── demo-dataset-chatml.jsonl │ ├── demo-dataset-content.jsonl │ ├── demo-dataset-deduplication.jsonl │ ├── demo-dataset-images.jsonl │ ├── demo-dataset-videos.jsonl │ ├── demo-dataset.jsonl │ ├── demo-dataset_1725870268.jsonl │ └── demo-dataset_1725870628.jsonl ├── data_mixture │ ├── app.py │ └── data │ │ ├── redpajama-c4-refined.jsonl │ │ ├── redpajama-cc-2023-06-refined.jsonl │ │ ├── redpajama-pile-stackexchange-refined.jsonl │ │ ├── the-pile-nih-refined.jsonl │ │ └── the-pile-uspto-refined.jsonl ├── data_process_hpo │ ├── app.py │ └── imgs │ │ └── data-juicer.png ├── data_process_loop │ ├── app.py │ ├── configs │ │ └── demo.yaml │ └── data │ │ └── demo-dataset.jsonl ├── data_visualization_diversity │ ├── app.py │ ├── configs │ │ └── demo.yaml │ └── data │ │ └── demo-dataset.jsonl ├── data_visualization_op_effect │ ├── app.py │ ├── configs │ │ ├── demo_en.yaml │ │ └── demo_zh.yaml │ └── data │ │ └── demo-dataset.jsonl ├── data_visualization_op_insight │ ├── app.css │ ├── app.py │ └── cache │ │ └── .gitkeep ├── data_visualization_statistics │ ├── app.py │ ├── configs │ │ └── demo.yaml │ └── data │ │ └── demo-dataset.jsonl ├── overview_scan │ ├── app.py │ └── data │ │ └── demo-dataset.jsonl ├── process_cft_zh_data │ ├── app.py │ └── data │ │ └── alpaca-cot.jsonl ├── process_code_data │ ├── app.py │ └── data │ │ └── stack_exchange.jsonl ├── process_on_ray │ ├── configs │ │ ├── dedup.yaml │ │ ├── demo-new-config.yaml │ │ └── demo.yaml │ └── data │ │ ├── demo-dataset.json │ │ └── demo-dataset.jsonl ├── process_sci_data │ ├── app.py │ └── data │ │ └── arxiv.jsonl ├── process_video_on_ray │ ├── configs │ │ ├── demo-new-config.yaml │ │ └── demo.yaml │ └── data │ │ ├── Note.md │ │ ├── demo-dataset.jsonl │ │ └── videos │ │ ├── video1.mp4 │ │ ├── video2.mp4 │ │ └── video3.mp4 ├── role_playing_system_prompt │ ├── README_ZH.md │ ├── role_playing_system_prompt.yaml │ └── system_prompt_generator.py ├── tool_dataset_splitting_by_language │ ├── app.py │ ├── data │ │ └── demo-dataset.jsonl │ └── dataset_splitting_by_language.py └── tool_quality_classifier │ ├── app.py │ ├── data │ └── demo-dataset.jsonl │ └── quality_classifier │ ├── __init__.py │ ├── eval.py │ ├── predict.py │ ├── qc_utils.py │ └── train.py ├── docs ├── BadDataExhibition.md ├── BadDataExhibition_ZH.md ├── DJ_SORA.md ├── DJ_SORA_ZH.md ├── DJ_service.md ├── DJ_service_ZH.md ├── DatasetCfg.md ├── DatasetCfg_ZH.md ├── DeveloperGuide.md ├── DeveloperGuide_ZH.md ├── Distributed.md ├── Distributed_ZH.md ├── Operators.md ├── RecipeGallery.md ├── RecipeGallery_ZH.md ├── Sandbox.md ├── Sandbox_ZH.md ├── awesome_llm_data.md ├── imgs │ ├── data-juicer.jpg │ ├── eval-01.png │ └── eval-02.png ├── sphinx_doc │ ├── Makefile │ ├── README.md │ ├── README_ZH.md │ ├── _templates │ │ └── package.rst_t │ ├── build_doc.sh │ ├── make.bat │ ├── redirect.html │ └── source │ │ ├── _static │ │ ├── awesome-list.html │ │ ├── sidebar-menu.css │ │ └── tutorial_kdd24.html │ │ ├── _templates │ │ ├── page.html │ │ └── sidebar │ │ │ └── bottom_menu.html │ │ ├── api.rst │ │ ├── conf.py │ │ ├── index.rst │ │ └── index_ZH.rst └── tutorial │ ├── DJ-Cookbook.md │ ├── DJ-Cookbook_ZH.md │ ├── Installation.md │ ├── 
Installation_ZH.md │ ├── QuickStart.md │ └── QuickStart_ZH.md ├── label_studio_localhost_connection.json ├── pyproject.toml ├── scripts ├── README.md ├── dlc │ ├── partition_data_dlc.py │ └── run_on_dlc.sh └── run_slurm.sh ├── service.py ├── tests ├── __init__.py ├── analysis │ ├── __init__.py │ ├── test_collector.py │ ├── test_column_wise_analysis.py │ ├── test_diversity_analysis.py │ ├── test_draw.py │ ├── test_measure.py │ └── test_overall_analysis.py ├── benchmark_performance │ ├── configs │ │ ├── audio.yaml │ │ ├── image.yaml │ │ ├── text.yaml │ │ └── video.yaml │ ├── report.py │ └── run.sh ├── config │ ├── __init__.py │ ├── demo_4_test.yaml │ ├── demo_4_test_bad_val.yaml │ └── test_config.py ├── core │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── test_data │ │ │ ├── sample.jsonl │ │ │ ├── sample.txt │ │ │ ├── test_config.yaml │ │ │ ├── test_config_list.yaml │ │ │ └── test_config_ray.yaml │ │ ├── test_data_validator.py │ │ ├── test_dataset_builder.py │ │ ├── test_dj_dataset.py │ │ ├── test_load_strategy.py │ │ └── test_ray_dataset.py │ ├── test_adapter.py │ └── test_monitor.py ├── download │ ├── __init__.py │ └── test_download.py ├── format │ ├── __init__.py │ ├── data │ │ ├── structured │ │ │ ├── demo-dataset.csv │ │ │ ├── demo-dataset.jsonl │ │ │ ├── demo-dataset.parquet │ │ │ └── demo-dataset.tsv │ │ └── text │ │ │ ├── sample1.txt │ │ │ ├── sample2.txt │ │ │ ├── sample3.txt │ │ │ ├── sample4.txt │ │ │ ├── sample5.txt │ │ │ └── sample6.txt │ ├── test_csv_formatter.py │ ├── test_empty_formatter.py │ ├── test_json_formatter.py │ ├── test_load_formatter.py │ ├── test_parquet_formatter.py │ ├── test_tsv_formatter.py │ └── test_unify_format.py ├── ops │ ├── __init__.py │ ├── aggregator │ │ ├── __init__.py │ │ ├── test_entity_attribute_aggregator.py │ │ ├── test_meta_tags_aggregator.py │ │ ├── test_most_relevant_entities_aggregator.py │ │ └── test_nested_aggregator.py │ ├── common │ │ └── __init__.py │ ├── data │ │ ├── audio1.wav │ │ ├── audio2.wav │ │ ├── audio3.ogg │ │ ├── blip.jpg │ │ ├── cat.jpg │ │ ├── img1.png │ │ ├── img2.jpg │ │ ├── img3.jpg │ │ ├── img4.png │ │ ├── img5.jpg │ │ ├── img6.jpg │ │ ├── img7.jpg │ │ ├── img8.jpg │ │ ├── img_pair_1.jpg │ │ ├── img_pair_2.jpg │ │ ├── lena-face.jpg │ │ ├── lena.jpg │ │ ├── video1.mp4 │ │ ├── video2.mp4 │ │ ├── video3-no-audio.mp4 │ │ ├── video3.mp4 │ │ ├── video4.mp4 │ │ └── video5.mp4 │ ├── deduplicator │ │ ├── __init__.py │ │ ├── test_document_deduplicator.py │ │ ├── test_document_minhash_deduplicator.py │ │ ├── test_document_simhash_deduplicator.py │ │ ├── test_image_deduplicator.py │ │ ├── test_ray_bts_minhash_deduplicator.py │ │ ├── test_ray_document_deduplicator.py │ │ ├── test_ray_image_deduplicator.py │ │ ├── test_ray_video_deduplicator.py │ │ └── test_video_deduplicator.py │ ├── filter │ │ ├── __init__.py │ │ ├── test_alphanumeric_filter.py │ │ ├── test_audio_duration_filter.py │ │ ├── test_audio_nmf_snr_filter.py │ │ ├── test_audio_size_filter.py │ │ ├── test_average_line_length_filter.py │ │ ├── test_character_repetition_filter.py │ │ ├── test_flagged_words_filter.py │ │ ├── test_general_field_filter.py │ │ ├── test_image_aesthetics_filter.py │ │ ├── test_image_aspect_ratio_filter.py │ │ ├── test_image_face_count_filter.py │ │ ├── test_image_face_ratio_filter.py │ │ ├── test_image_nsfw_filter.py │ │ ├── test_image_pair_similarity_filter.py │ │ ├── test_image_shape_filter.py │ │ ├── test_image_size_filter.py │ │ ├── test_image_text_matching_filter.py │ │ ├── test_image_text_similarity_filter.py │ │ ├── 
test_image_watermark_filter.py │ │ ├── test_language_id_score_filter.py │ │ ├── test_llm_difficulty_score_filter.py │ │ ├── test_llm_quality_score_filter.py │ │ ├── test_maximum_line_length_filter.py │ │ ├── test_perplexity_filter.py │ │ ├── test_phrase_grounding_recall_filter.py │ │ ├── test_special_characters_filter.py │ │ ├── test_specified_field_filter.py │ │ ├── test_specified_numeric_field_filter.py │ │ ├── test_stopwords_filter.py │ │ ├── test_suffix_filter.py │ │ ├── test_text_action_filter.py │ │ ├── test_text_entity_dependency_filter.py │ │ ├── test_text_length_filter.py │ │ ├── test_text_pair_similarity_filter.py │ │ ├── test_token_num_filter.py │ │ ├── test_video_aesthetics_filter.py │ │ ├── test_video_aspect_ratio_filter.py │ │ ├── test_video_duration_filter.py │ │ ├── test_video_frames_text_similarity_filter.py │ │ ├── test_video_motion_score_filter.py │ │ ├── test_video_motion_score_raft_filter.py │ │ ├── test_video_nsfw_filter.py │ │ ├── test_video_ocr_area_ratio_filter.py │ │ ├── test_video_resolution_filter.py │ │ ├── test_video_tagging_from_frames_filter.py │ │ ├── test_video_watermark_filter.py │ │ ├── test_word_repetition_filter.py │ │ └── test_words_num_filter.py │ ├── grouper │ │ ├── __init__.py │ │ ├── test_key_value_grouper.py │ │ ├── test_naive_grouper.py │ │ └── test_naive_reverse_grouper.py │ ├── mapper │ │ ├── __init__.py │ │ ├── annotation │ │ │ ├── __init__.py │ │ │ ├── test_annotation_mapper.py │ │ │ └── test_human_preference_annotation_mapper.py │ │ ├── test_audio_add_gaussian_noise_mapper.py │ │ ├── test_audio_ffmpeg_wrapped_mapper.py │ │ ├── test_calibrate_qa_mapper.py │ │ ├── test_calibrate_query_mapper.py │ │ ├── test_calibrate_response_mapper.py │ │ ├── test_chinese_convert_mapper.py │ │ ├── test_clean_copyright_mapper.py │ │ ├── test_clean_email_mapper.py │ │ ├── test_clean_html_mapper.py │ │ ├── test_clean_ip_mapper.py │ │ ├── test_clean_links_mapper.py │ │ ├── test_dialog_intent_detection_mapper.py │ │ ├── test_dialog_sentiment_detection_mapper.py │ │ ├── test_dialog_sentiment_intensity_mapper.py │ │ ├── test_dialog_topic_detection_mapper.py │ │ ├── test_expand_macro_mapper.py │ │ ├── test_extract_entity_attribute_mapper.py │ │ ├── test_extract_entity_relation_mapper.py │ │ ├── test_extract_event_mapper.py │ │ ├── test_extract_keyword_mapper.py │ │ ├── test_extract_nickname_mapper.py │ │ ├── test_extract_support_text_mapper.py │ │ ├── test_extract_tables_from_html_mapper.py │ │ ├── test_fix_unicode_mapper.py │ │ ├── test_generate_qa_from_examples_mapper.py │ │ ├── test_generate_qa_from_text_mapper.py │ │ ├── test_image_blur_mapper.py │ │ ├── test_image_captioning_mapper.py │ │ ├── test_image_diffusion_mapper.py │ │ ├── test_image_face_blur_mapper.py │ │ ├── test_image_remove_background_mapper.py │ │ ├── test_image_segment_mapper.py │ │ ├── test_image_tagging_mapper.py │ │ ├── test_imgdiff_difference_area_generator_mapper.py │ │ ├── test_imgdiff_difference_caption_generator_mapper.py │ │ ├── test_mllm_mapper.py │ │ ├── test_nlpaug_en_mapper.py │ │ ├── test_nlpcda_zh_mapper.py │ │ ├── test_optimize_qa_mapper.py │ │ ├── test_optimize_query_mapper.py │ │ ├── test_optimize_response_mapper.py │ │ ├── test_pair_preference_mapper.py │ │ ├── test_punctuation_normalization_mapper.py │ │ ├── test_python_file_mapper.py │ │ ├── test_python_lambda_mapper.py │ │ ├── test_query_intent_detection_mapper.py │ │ ├── test_query_sentiment_detection_mapper.py │ │ ├── test_query_topic_detection_mapper.py │ │ ├── test_relation_identity_mapper.py │ │ ├── 
test_remove_bibliography_mapper.py │ │ ├── test_remove_comments_mapper.py │ │ ├── test_remove_header_mapper.py │ │ ├── test_remove_long_words_mapper.py │ │ ├── test_remove_non_chinese_character_mapper.py │ │ ├── test_remove_repeat_sentences_mapper.py │ │ ├── test_remove_specific_chars_mapper.py │ │ ├── test_remove_table_text_mapper.py │ │ ├── test_remove_words_with_incorrect_substrings_mapper.py │ │ ├── test_replace_content_mapper.py │ │ ├── test_sdxl_prompt2prompt_mapper.py │ │ ├── test_sentence_augmentation_mapper.py │ │ ├── test_sentence_split_mapper.py │ │ ├── test_text_chunk_mapper.py │ │ ├── test_video_captioning_from_audio_mapper.py │ │ ├── test_video_captioning_from_frames_mapper.py │ │ ├── test_video_captioning_from_summarizer_mapper.py │ │ ├── test_video_captioning_from_video_mapper.py │ │ ├── test_video_extract_frames_mapper.py │ │ ├── test_video_face_blur_mapper.py │ │ ├── test_video_ffmpeg_wrapped_mapper.py │ │ ├── test_video_remove_watermark_mapper.py │ │ ├── test_video_resize_aspect_ratio_mapper.py │ │ ├── test_video_resize_resolution_mapper.py │ │ ├── test_video_split_by_duration_mapper.py │ │ ├── test_video_split_by_key_frame_mapper.py │ │ ├── test_video_split_by_scene_mapper.py │ │ ├── test_video_tagging_from_audio_mapper.py │ │ ├── test_video_tagging_from_frames_mapper.py │ │ └── test_whitespace_normalization_mapper.py │ ├── selector │ │ ├── __init__.py │ │ ├── test_frequency_specified_field_selector.py │ │ ├── test_random_selector.py │ │ ├── test_range_specified_field_selector.py │ │ ├── test_tags_specified_field_selector.py │ │ └── test_topk_specified_field_selector.py │ └── test_op_fusion.py ├── run.py ├── tools │ ├── __init__.py │ └── test_process_data.py └── utils │ ├── __init__.py │ ├── test_asset_utils.py │ ├── test_availablility_utils.py │ ├── test_cache_utils.py │ ├── test_ckpt_utils.py │ ├── test_common_utils.py │ ├── test_compress.py │ ├── test_constant.py │ ├── test_file_utils.py │ ├── test_fingerprint_utils.py │ ├── test_lazy_loader.py │ ├── test_logger_utils.py │ ├── test_mm_utils.py │ ├── test_model_utils.py │ ├── test_process_utils.py │ ├── test_registry.py │ ├── test_resource_utils.py │ └── test_unittest_utils.py ├── thirdparty ├── LLM_ecosystems │ ├── README.md │ ├── README_ZH.md │ ├── patch │ │ ├── helm.diff │ │ └── megatron.diff │ ├── setup_helm.sh │ └── setup_megatron.sh └── models │ ├── README.md │ ├── README_ZH.md │ ├── patch │ └── easyanimate.diff │ └── setup_easyanimate.sh ├── tools ├── __init__.py ├── analyze_data.py ├── converter │ ├── batch_convert.sh │ ├── convert_gpt_to_transformers.py │ └── modeling_megatron_llama.py ├── data_resplit.py ├── distributed_deduplication │ ├── README.md │ ├── README_ZH.md │ ├── __init__.py │ ├── dedup_utils.py │ └── spark_dedup.py ├── dj_install.py ├── evaluator │ ├── README.md │ ├── README_ZH.md │ ├── config │ │ ├── evaluator_example.yaml │ │ └── helm_spec_template.conf │ ├── evaluator.py │ ├── gpt_eval │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── __init__.py │ │ ├── answer │ │ │ └── openai │ │ │ │ └── gpt-3.5-turbo.jsonl │ │ ├── answer_generator.py │ │ ├── config │ │ │ ├── config.yaml │ │ │ ├── prompt.jsonl │ │ │ ├── question.jsonl │ │ │ └── reviewer.jsonl │ │ └── gpt_evaluator.py │ └── recorder │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── __init__.py │ │ ├── config │ │ ├── leaderboard_example.yaml │ │ ├── llama_example.yaml │ │ └── mymodel_example.yaml │ │ └── wandb_writer.py ├── fmt_conversion │ ├── README.md │ ├── README_ZH.md │ ├── multimodal │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── 
absolute_path_to_relative_path.py │ │ ├── data_juicer_format_to_target_format │ │ │ ├── dj_to_internvid.py │ │ │ ├── dj_to_llava.py │ │ │ ├── dj_to_mmc4.py │ │ │ ├── dj_to_msrvtt.py │ │ │ ├── dj_to_video_chatgpt.py │ │ │ ├── dj_to_wavcaps.py │ │ │ └── dj_to_youku.py │ │ ├── source_format_to_data_juicer_format │ │ │ ├── internvid_to_dj.py │ │ │ ├── llava_to_dj.py │ │ │ ├── mmc4_to_dj.py │ │ │ ├── msrvtt_to_dj.py │ │ │ ├── video_chatgpt_to_dj.py │ │ │ ├── wavcaps_to_dj.py │ │ │ └── youku_to_dj.py │ │ └── utils.py │ └── post_tuning_dialog │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── data_juicer_format_to_target_format │ │ ├── dj_to_alpaca.py │ │ ├── dj_to_llama_factory_sharegpt.py │ │ ├── dj_to_messages.py │ │ └── dj_to_ms_swift_sharegpt.py │ │ └── source_format_to_data_juicer_format │ │ ├── alpaca_to_dj.py │ │ ├── llama_factory_sharegpt_to_dj.py │ │ ├── messages_to_dj.py │ │ └── ms_swift_sharegpt_to_dj.py ├── generate_smtp_cert.py ├── generate_uv_lock.py ├── hpo │ ├── README.md │ ├── README_ZH.md │ ├── configs │ │ ├── process.yaml │ │ └── quality_score_hpo.yaml │ ├── demo-redpajama-c4-refined.jsonl │ ├── execute_hpo_3sigma.py │ ├── execute_hpo_wandb.py │ └── objects.py ├── humanops │ ├── README.md │ ├── enable_legacy_token.png │ └── label_studio_service.py ├── mm_eval │ ├── __init__.py │ ├── inception_metrics │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── calc_metrics_for_videos.py │ │ ├── dataset.py │ │ ├── distributed.py │ │ ├── util.py │ │ └── video_metrics │ │ │ ├── frechet_inception_distance.py │ │ │ ├── frechet_video_distance.py │ │ │ ├── inception_score.py │ │ │ ├── kernel_inception_distance.py │ │ │ ├── kernel_video_distance.py │ │ │ ├── metric_main.py │ │ │ ├── metric_utils.py │ │ │ ├── precision_recall.py │ │ │ ├── video_inception_score.py │ │ │ └── video_precision_recall.py │ └── vbench_metrics │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── VBench_full_info.json │ │ ├── VBench_mini_info.json │ │ └── evaluate.py ├── multimodal │ └── __init__.py ├── postprocess │ ├── README.md │ ├── README_ZH.md │ ├── count_token.py │ ├── data_mixture.py │ └── deserialize_meta.py ├── preprocess │ ├── README.md │ ├── README_ZH.md │ ├── dataset_split_by_language.py │ ├── raw_alpaca_cot_merge_add_meta.py │ ├── raw_arxiv_to_jsonl.py │ ├── raw_stackexchange_to_jsonl.py │ ├── reformat_csv_nan_value.py │ ├── reformat_jsonl_nan_value.py │ └── serialize_meta.py ├── process_data.py ├── quality_classifier │ ├── README.md │ ├── README_ZH.md │ ├── __init__.py │ ├── eval.py │ ├── predict.py │ ├── qc_utils.py │ └── train.py └── sandbox_starter.py └── uv.lock /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | # avoid measuring strange non-existing files 4 | /workspace/config.py 5 | /workspace/config-3.py 6 | 7 | # avoid measuring third-party dist packages 8 | */dist-packages/* 9 | 10 | # avoid measuring code of unittest 11 | tests/* 12 | 13 | [report] 14 | ignore_errors = True 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 
4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - uses: actions/setup-python@v4 11 | with: 12 | python-version: '3.10' 13 | - uses: pre-commit/action@v3.0.0 14 | -------------------------------------------------------------------------------- /.github/workflows/publish-docker-oss.yml: -------------------------------------------------------------------------------- 1 | name: Sync Docker Image to Aliyun OSS 2 | 3 | # This workflow uses actions that are not certified by GitHub. 4 | # They are provided by a third-party and are governed by 5 | # separate terms of service, privacy policy, and support 6 | # documentation. 7 | 8 | on: 9 | workflow_dispatch: 10 | schedule: 11 | - cron: '0 16 * * 1' # Every Monday at 16:00 UTC -- Every Tuesday at 0:00 in Beijing Time 12 | 13 | env: 14 | IMAGE_NAME: datajuicer/data-juicer 15 | IMAGE_TAG: latest 16 | REGISTRY_MIRROR: docker.xiaogenban1993.com 17 | ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true 18 | 19 | 20 | jobs: 21 | build: 22 | runs-on: [docker-internal] 23 | timeout-minutes: 1440 # 24 hours 24 | 25 | steps: 26 | - name: Pull Docker image 27 | run: | 28 | docker pull ${{ env.REGISTRY_MIRROR }}/${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} 29 | 30 | - name: Export to tar file 31 | run: | 32 | docker save ${{ env.REGISTRY_MIRROR }}/${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} | gzip > data-juicer-${{ env.IMAGE_TAG }}.tar.gz 33 | 34 | - name: Sync Docker image to Aliyun OSS 35 | id: sync_to_oss 36 | run: | 37 | curl -s "http://127.0.0.1:5006/sync2oss?file_path=data-juicer-${{ env.IMAGE_TAG }}.tar.gz" > /dev/null 38 | 39 | - name: Clean up resources 40 | if: success() 41 | run: | 42 | rm -rf data-juicer-${{ env.IMAGE_TAG }}.tar.gz 43 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Publish PyPi Package 10 | 11 | on: 12 | workflow_dispatch: 13 | release: 14 | types: [published] 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | deploy: 21 | 22 | runs-on: ubuntu-latest 23 | 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Set up Python 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: '3.x' 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install setuptools 34 | - name: Build package 35 | run: python setup.py sdist bdist_wheel 36 | - name: Publish package 37 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /.github/workflows/sphinx_docs_linkcheck.yml: -------------------------------------------------------------------------------- 1 | name: Sphinx Docs LinkCheck 2 | 3 | on: 4 | schedule: 5 | - cron: "0 5 * * 0" # Runs at 05:00 on Sunday. 6 | workflow_dispatch: 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | linkcheck: 14 | name: Check Links 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [ "3.10" ] 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | - name: Setup Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@master 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -v -e .[dev] 30 | - name: Run Sphinx linkcheck 31 | run: | 32 | cd docs/sphinx_doc 33 | sphinx-apidoc -f -o source ../../data_juicer -t _templates 34 | python ./create_symlinks.py 35 | sphinx-build -b linkcheck -j 4 source build/linkcheck || true 36 | - name: Upload Linkcheck Results 37 | uses: actions/upload-artifact@v4 38 | with: 39 | name: LinkcheckResults 40 | path: "docs/sphinx_doc/build/linkcheck" 41 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '30 9 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v5 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | stale-issue-message: 'This issue is marked as stale because there has been no activity for 21 days. Remove the stale label or add new comments, or this issue will be closed in 3 days.' 25 | stale-pr-message: 'This PR is marked as stale because there has been no activity for 21 days. Remove the stale label or add new comments, or this PR will be closed in 3 days.' 26 | close-issue-message: 'Close this stale issue.' 27 | close-pr-message: 'Close this stale PR.'
28 | stale-issue-label: 'stale-issue' 29 | stale-pr-label: 'stale-pr' 30 | days-before-stale: 21 31 | days-before-close: 3 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # data & resources 3 | outputs/ 4 | assets/ 5 | 6 | # setup 7 | data_juicer.egg-info/ 8 | py_data_juicer.egg-info/ 9 | build/ 10 | dist 11 | 12 | # others 13 | .DS_Store 14 | .idea/ 15 | wandb/ 16 | __pycache__ 17 | .vscode/ 18 | 19 | # label studio related 20 | label_studio_data/ 21 | label_studio_venv/ 22 | label_studio_connection.json 23 | 24 | **/__dj__produced_data__/* 25 | venv/ 26 | .venv/ 27 | 28 | # dup files created by tests 29 | tests/ops/data/*dup* 30 | tests/tools/tmp_*/ 31 | tests/ops/deduplicator/chinese_dedup/ 32 | tests/ops/deduplicator/english_dedup/ 33 | -------------------------------------------------------------------------------- /configs/annotation/annotation_default.yaml: -------------------------------------------------------------------------------- 1 | # configs/demo/annotation.yaml 2 | # Process config for human preference annotation with Label Studio 3 | 4 | # Global parameters 5 | project_name: 'demo-annotation-human-preference' 6 | dataset_path: './demos/data/demo-dataset-annotation-human-preference.jsonl' 7 | np: 1 # Use single process for annotation tasks 8 | open_tracer: true 9 | 10 | export_path: './outputs/demo-annotation-human-preference/annotated-data.jsonl' 11 | 12 | # use prompt as the text field, which will be used in the label studio template 13 | text_keys: "prompt" 14 | 15 | # Process schedule 16 | process: 17 | # Annotation step using Label Studio with human preference 18 | - human_preference_annotation_mapper: 19 | # general annotation project settings 20 | project_name_prefix: "Human_Preference_Annotation" 21 | wait_for_annotations: true # Whether to wait for annotations to complete 22 | timeout: 3600 # Maximum time to wait for annotations in seconds (1 hour) 23 | poll_interval: 10 # Time between annotation status checks in seconds 24 | max_tasks_per_batch: 10 # Maximum number of tasks in a single batch 25 | notification_config: 26 | enabled: false 27 | 28 | # label studio connection settings 29 | api_url: "http://localhost:7070" # Default Label Studio URL 30 | api_key: "05409236-67a5-4169-af96-a52a818d0e81" # Your API key for label studio authentication # pragma: allowlist secret 31 | 32 | # human preference annotation settings 33 | prompt_key: "prompt" # Prompt field 34 | answer1_key: "answer1" # First answer option 35 | answer2_key: "answer2" # Second answer option 36 | chosen_key: "chosen" # Chosen field 37 | rejected_key: "rejected" # Rejected field 38 | -------------------------------------------------------------------------------- /configs/config_min.yaml: -------------------------------------------------------------------------------- 1 | project_name: 'min' 2 | 3 | np: 4 # number of subprocess to process your dataset 4 | export_path: './outputs/' 5 | 6 | text_keys: 'text' 7 | video_key: 'videos' 8 | image_key: 'images' 9 | audio_key: 'audios' 10 | executor_type: default # type of executor, support "default" or "ray" for now. 11 | ray_address: auto # the address of the Ray cluster. 
12 | suffixes: null 13 | add_suffix: false 14 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/alpaca_cot/alpaca-cot-en-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-alpaca-cot-en' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 50 # number of subprocess to process your dataset 7 | open_tracer: true 8 | 9 | # process schedule 10 | # a list of several process operators with their arguments 11 | process: 12 | - document_deduplicator: # 104636705 13 | lowercase: true 14 | ignore_non_character: true 15 | 16 | - alphanumeric_filter: # 104636381 17 | tokenization: false 18 | min_ratio: 0.1 19 | - character_repetition_filter: # 104630030 20 | rep_len: 10 21 | max_ratio: 0.6 22 | - flagged_words_filter: # 104576967 23 | lang: en 24 | tokenization: true 25 | max_ratio: 0.017 26 | - maximum_line_length_filter: # 104575811 27 | min_len: 20 28 | - text_length_filter: # 104573711 29 | min_len: 30 30 | 31 | - document_simhash_deduplicator: # 72855345 32 | tokenization: space 33 | window_size: 3 34 | lowercase: true 35 | ignore_pattern: '\p{P}' 36 | num_blocks: 9 37 | hamming_distance: 7 38 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/alpaca_cot/alpaca-cot-zh-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-alpaca-cot-zh' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 50 # number of subprocess to process your dataset 7 | open_tracer: true 8 | 9 | # process schedule 10 | # a list of several process operators with their arguments 11 | process: 12 | - document_deduplicator: # 16957516 13 | lowercase: true # whether to convert text to lower case 14 | ignore_non_character: true 15 | 16 | - alphanumeric_filter: # 16957388 17 | tokenization: false 18 | min_ratio: 0.10 19 | - character_repetition_filter: # 16956845 20 | rep_len: 10 21 | max_ratio: 0.6 22 | - flagged_words_filter: # 16954629 23 | lang: zh 24 | tokenization: true 25 | use_words_aug: true 26 | max_ratio: 0.017 27 | - text_length_filter: # 16954317 28 | min_len: 10 29 | 30 | - document_simhash_deduplicator: # 9873214 31 | tokenization: character 32 | window_size: 4 # small window size for short texts 33 | lowercase: true 34 | ignore_pattern: '\p{P}' 35 | num_blocks: 10 36 | hamming_distance: 8 # larger hamming distance threshold for short texts 37 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/data-juicer-sandbox-optimal.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-T2V-optimal' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | # process schedule 9 | # a list of several process operators with their arguments 10 | process: 11 | - video_nsfw_filter: 12 | hf_nsfw_model: Falconsai/nsfw_image_detection 13 | max_score: 0.000195383 14 | frame_sampling_method: uniform 15 | frame_num: 3 16 | reduce_mode: avg 17 | 
any_or_all: any 18 | mem_required: '1GB' 19 | - video_frames_text_similarity_filter: 20 | hf_clip: openai/clip-vit-base-patch32 21 | min_score: 0.306337 22 | max_score: 1.0 23 | frame_sampling_method: uniform 24 | frame_num: 3 25 | horizontal_flip: false 26 | vertical_flip: false 27 | reduce_mode: avg 28 | any_or_all: any 29 | mem_required: '10GB' 30 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/data-juicer-sandbox-self-evolution.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-T2V-evolution' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | # process schedule 9 | # a list of several process operators with their arguments 10 | process: 11 | - video_nsfw_filter: 12 | hf_nsfw_model: Falconsai/nsfw_image_detection 13 | max_score: 0.000195383 14 | frame_sampling_method: uniform 15 | frame_num: 3 16 | reduce_mode: avg 17 | any_or_all: any 18 | mem_required: '1GB' 19 | - video_frames_text_similarity_filter: 20 | hf_clip: openai/clip-vit-base-patch32 21 | min_score: 0.306337 22 | max_score: 1.0 23 | frame_sampling_method: uniform 24 | frame_num: 3 25 | horizontal_flip: false 26 | vertical_flip: false 27 | reduce_mode: avg 28 | any_or_all: any 29 | mem_required: '10GB' 30 | - video_motion_score_filter: 31 | min_score: 3 32 | max_score: 20 33 | sampling_fps: 2 34 | any_or_all: any 35 | - video_aesthetics_filter: 36 | hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE 37 | min_score: 0.418164 38 | max_score: 1.0 39 | frame_sampling_method: 'uniform' 40 | frame_num: 3 41 | reduce_mode: avg 42 | any_or_all: any 43 | mem_required: '1500MB' 44 | - video_duration_filter: 45 | min_duration: 2 46 | max_duration: 100000 47 | any_or_all: any 48 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/github_code/redpajama-code-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-code-rp' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 50 # number of subprocess to process your dataset 7 | open_tracer: true 8 | 9 | # process schedule 10 | # a list of several process operators with their arguments 11 | process: 12 | - clean_email_mapper: 13 | - clean_links_mapper: 14 | - fix_unicode_mapper: 15 | - punctuation_normalization_mapper: 16 | - whitespace_normalization_mapper: 17 | - clean_copyright_mapper: 18 | 19 | - alphanumeric_filter: 20 | tokenization: False 21 | min_ratio: 0.4 22 | max_ratio: 0.8 23 | - alphanumeric_filter: 24 | tokenization: True 25 | min_ratio: 1.5 26 | max_ratio: 3 27 | - average_line_length_filter: 28 | min_len: 15 29 | max_len: 100 30 | - character_repetition_filter: 31 | rep_len: 10 32 | min_ratio: 0.05 33 | max_ratio: 0.3 34 | - maximum_line_length_filter: 35 | min_len: 50 36 | max_len: 500 37 | - text_length_filter: 38 | min_len: 300 39 | - words_num_filter: 40 | lang: en 41 | tokenization: False 42 | min_num: 30 43 | max_num: 5000 44 | - word_repetition_filter: 45 | lang: en 46 | tokenization: False 47 | rep_len: 10 48 | max_ratio: 0.1 49 | - document_simhash_deduplicator: 50 | tokenization: space 51 
| window_size: 6 52 | lowercase: true 53 | ignore_pattern: '\p{P}' 54 | num_blocks: 6 55 | hamming_distance: 4 56 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/github_code/redpajama-stack-code-deduplicate.yaml: -------------------------------------------------------------------------------- 1 | project_name: 'Data-Juicer-recipes-code' 2 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 3 | export_path: '/path/to/your/dataset.jsonl' # path to your dataset result file 4 | 5 | np: 50 # number of subprocess to process your dataset 6 | open_tracer: true 7 | 8 | # process schedule 9 | # a list of several process operators with their arguments 10 | 11 | process: 12 | - document_simhash_deduplicator: 13 | tokenization: space 14 | window_size: 6 15 | lowercase: true 16 | ignore_pattern: '\p{P}' 17 | num_blocks: 6 18 | hamming_distance: 4 19 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/github_code/stack-code-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-the-stack' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | text_keys: 'content' 7 | 8 | np: 50 # number of subprocess to process your dataset 9 | open_tracer: true 10 | 11 | # process schedule 12 | # a list of several process operators with their arguments 13 | process: 14 | - clean_email_mapper: 15 | - clean_links_mapper: 16 | - fix_unicode_mapper: 17 | - punctuation_normalization_mapper: 18 | - whitespace_normalization_mapper: 19 | - clean_copyright_mapper: 20 | 21 | - alphanumeric_filter: # 18766 22 | tokenization: false 23 | min_ratio: 0.2 # < 3sigma (0.3791) 24 | max_ratio: 0.9163 # 3sigma 25 | - alphanumeric_filter: # 146432 26 | tokenization: true 27 | min_ratio: 0.546 # 3sigma 28 | max_ratio: 3.65 # 3sigma 29 | - average_line_length_filter: # for code 30 | min_len: 10 # > 3sigma (0) -- 48790 31 | max_len: 150 # < 3sigma (15603) -- 233275 32 | - character_repetition_filter: 33 | max_ratio: 0.36 # 3sigma -- 346875 34 | - maximum_line_length_filter: # for code 35 | max_len: 1000 # remove 256670 samples 36 | - text_length_filter: 37 | max_len: 96714 # 3sigma -- 190006 38 | - words_num_filter: 39 | min_num: 20 # remove 1504958 samples 40 | max_num: 6640 # 3sigma -- remove 179847 samples 41 | - word_repetition_filter: 42 | rep_len: 10 43 | max_ratio: 0.357 # 3sigma -- 598462 44 | 45 | - document_simhash_deduplicator: 46 | tokenization: space 47 | window_size: 6 48 | lowercase: true 49 | ignore_pattern: '\p{P}' 50 | num_blocks: 6 51 | hamming_distance: 4 52 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/pile-europarl-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-EuroParl' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 50 # number of subprocess to process your dataset 7 | open_tracer: true 8 | 9 | # process schedule 10 | # a list of several process operators with their arguments 11 | process: 12 | - clean_email_mapper: 13 | - clean_links_mapper: 14 | - fix_unicode_mapper: 15 | - punctuation_normalization_mapper: 16 | - 
whitespace_normalization_mapper: 17 | 18 | - alphanumeric_filter: 19 | tokenization: false 20 | min_ratio: 0.75 # <3sigma (0.779) 21 | max_ratio: 0.90 # >3sigma(0.878) 22 | - average_line_length_filter: # for code 23 | max_len: 588 # 3sigma 24 | - character_repetition_filter: 25 | rep_len: 10 26 | max_ratio: 0.16 # >3sigma (0.114) 27 | - flagged_words_filter: 28 | lang: en 29 | tokenization: true 30 | max_ratio: 0.0007 # 3sigma 31 | - language_id_score_filter: 32 | min_score: 0.7 33 | - maximum_line_length_filter: # for code 34 | max_len: 4000 # >3sigma (3104) 35 | - perplexity_filter: 36 | lang: en 37 | max_ppl: 7596 #(3sigma) 38 | - special_characters_filter: 39 | max_ratio: 0.3 # > 3sigma (0.243) 40 | - text_length_filter: 41 | max_len: 2e5 42 | - words_num_filter: 43 | tokenization: true 44 | min_num: 20 45 | max_num: 1e5 # 3sigma 46 | - word_repetition_filter: 47 | lang: en 48 | tokenization: true 49 | rep_len: 10 50 | max_ratio: 0.2 # > 3sigma (0.185) 51 | 52 | - document_simhash_deduplicator: 53 | tokenization: space 54 | window_size: 6 55 | lowercase: true 56 | ignore_pattern: '\p{P}' 57 | num_blocks: 6 58 | hamming_distance: 4 59 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/pile-hackernews-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-HackerNews' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 48 # number of subprocess to process your dataset 7 | open_tracer: true 8 | 9 | # process schedule 10 | # a list of several process operators with their arguments 11 | process: 12 | - clean_email_mapper: 13 | #- clean_links_mapper: 14 | - fix_unicode_mapper: 15 | - punctuation_normalization_mapper: 16 | - whitespace_normalization_mapper: 17 | 18 | - alphanumeric_filter: 19 | tokenization: false 20 | min_ratio: 0.2 #<3sigma 21 | - average_line_length_filter: 22 | min_len: 15 # >3sigma 23 | - character_repetition_filter: 24 | rep_len: 10 25 | max_ratio: 0.3 # >3sigma 26 | - flagged_words_filter: 27 | lang: en 28 | tokenization: true 29 | max_ratio: 0.05 # >3sigma 30 | - language_id_score_filter: 31 | min_score: 0.2 # <3sigma 32 | - maximum_line_length_filter: 33 | min_len: 20 # >3sigma 34 | - perplexity_filter: 35 | lang: en 36 | max_ppl: 10000 # >3sigma 37 | - special_characters_filter: 38 | max_ratio: 0.7 # >3sigma 39 | - text_length_filter: 40 | min_len: 100 # > 3sigma 41 | - words_num_filter: 42 | lang: en 43 | tokenization: true 44 | min_num: 30 # > 3sigma 45 | - word_repetition_filter: 46 | lang: en 47 | tokenization: true 48 | rep_len: 10 49 | max_ratio: 0.8 # > 3sigma 50 | 51 | - document_simhash_deduplicator: 52 | tokenization: space 53 | window_size: 6 54 | lowercase: true 55 | ignore_pattern: '\p{P}' 56 | num_blocks: 6 57 | hamming_distance: 4 58 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/pile-nih-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-Hin' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 50 # number of subprocess to process your dataset 7 | open_tracer: true 8 | 9 | # process schedule 10 | # a list of several process operators with their 
arguments 11 | process: 12 | - clean_email_mapper: 13 | - clean_links_mapper: 14 | - fix_unicode_mapper: 15 | - punctuation_normalization_mapper: 16 | - whitespace_normalization_mapper: 17 | 18 | - alphanumeric_filter: 19 | tokenization: false 20 | min_ratio: 0.75 # <3sigma (0.800) 21 | max_ratio: 0.866 22 | - average_line_length_filter: 23 | max_len: 10000 # >3sigma (5425) 24 | - character_repetition_filter: 25 | rep_len: 10 26 | max_ratio: 0.2 # >3sigma (0.127) 27 | - flagged_words_filter: 28 | lang: en 29 | tokenization: true 30 | max_ratio: 0.0003 # 3sigma 31 | - language_id_score_filter: 32 | min_score: 0.7 33 | - perplexity_filter: 34 | lang: en 35 | max_ppl: 1669 #(3sigma) 36 | - special_characters_filter: 37 | max_ratio: 0.3 # > 3sigma (0.218) 38 | - words_num_filter: 39 | tokenization: true 40 | min_num: 20 41 | max_num: 2000 42 | - word_repetition_filter: 43 | lang: en 44 | tokenization: true 45 | rep_len: 10 46 | max_ratio: 0.104 # 3sigma 47 | 48 | - document_simhash_deduplicator: 49 | tokenization: space 50 | window_size: 6 51 | lowercase: true 52 | ignore_pattern: '\p{P}' 53 | num_blocks: 6 54 | hamming_distance: 4 55 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/pile-philpaper-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-Philpaper' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 50 # number of subprocess to process your dataset 7 | open_tracer: true 8 | 9 | # process schedule 10 | # a list of several process operators with their arguments 11 | process: 12 | - clean_email_mapper: 13 | - clean_links_mapper: 14 | - fix_unicode_mapper: 15 | - punctuation_normalization_mapper: 16 | - whitespace_normalization_mapper: 17 | 18 | - alphanumeric_filter: 19 | tokenization: false 20 | min_ratio: 0.7 # <3sigma (0.72) 21 | - average_line_length_filter: # for code 22 | max_len: 5e5 # >3sigma (406006) 23 | - character_repetition_filter: 24 | rep_len: 10 25 | max_ratio: 0.2 # >3sigma (0.145) 26 | - flagged_words_filter: 27 | lang: en 28 | tokenization: true 29 | max_ratio: 0.0007 # 3sigma 30 | - language_id_score_filter: 31 | min_score: 0.6 32 | - maximum_line_length_filter: # for code 33 | max_len: 1e6 # 3sigma 34 | - perplexity_filter: 35 | lang: en 36 | max_ppl: 5000 37 | - special_characters_filter: 38 | max_ratio: 0.4 # > 3sigma (0.302) 39 | - words_num_filter: 40 | lang: en 41 | tokenization: true 42 | min_num: 1000 43 | max_num: 2e5 # 3sigma 44 | - word_repetition_filter: 45 | lang: en 46 | tokenization: true 47 | rep_len: 10 48 | max_ratio: 0.3 # > 3sigma (0.249) 49 | 50 | - document_simhash_deduplicator: 51 | tokenization: space 52 | window_size: 6 53 | lowercase: true 54 | ignore_pattern: '\p{P}' 55 | num_blocks: 6 56 | hamming_distance: 4 57 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/redpajama-arxiv-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-arxivrecipes-arxiv' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 50 # number of subprocess to process your dataset 7 | open_tracer: true 8 | 9 | # process schedule 10 | # a list of several process operators 
with their arguments 11 | process: 12 | - clean_email_mapper: 13 | - clean_links_mapper: 14 | - fix_unicode_mapper: 15 | - punctuation_normalization_mapper: 16 | - whitespace_normalization_mapper: 17 | 18 | - alphanumeric_filter: 19 | tokenization: false 20 | min_ratio: 0.516 # 3sigma 21 | max_ratio: 0.915 # 3sigma 22 | - average_line_length_filter: # for code 23 | max_len: 682 # 3sigma 24 | - character_repetition_filter: 25 | rep_len: 10 26 | max_ratio: 0.3 27 | - flagged_words_filter: 28 | lang: en 29 | tokenization: true 30 | max_ratio: 0.00076 # 3sigma 31 | #- language_id_score_filter: # remove language filter 32 | - maximum_line_length_filter: # for code 33 | max_len: 4000 34 | - perplexity_filter: 35 | lang: en 36 | max_ppl: 8000 37 | - special_characters_filter: 38 | max_ratio: 0.6 39 | - text_length_filter: 40 | max_len: 350000 41 | - words_num_filter: 42 | lang: en 43 | tokenization: true 44 | min_num: 20 45 | max_num: 100000 46 | - word_repetition_filter: 47 | lang: en 48 | tokenization: true 49 | rep_len: 10 50 | max_ratio: 0.574 # 3sigma 51 | 52 | - document_simhash_deduplicator: 53 | tokenization: space 54 | window_size: 6 55 | lowercase: true 56 | ignore_pattern: '\p{P}' 57 | num_blocks: 6 58 | hamming_distance: 4 59 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/redpajama-book-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-book' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' 5 | 6 | np: 50 # number of subprocess to process your dataset 7 | open_tracer: true 8 | 9 | # process schedule 10 | # a list of several process operators with their arguments 11 | process: 12 | - clean_email_mapper: 13 | - clean_links_mapper: 14 | - fix_unicode_mapper: 15 | - punctuation_normalization_mapper: 16 | - whitespace_normalization_mapper: 17 | 18 | - alphanumeric_filter: 19 | tokenization: false 20 | min_ratio: 0.55 # <3sigma (0.697) 21 | max_ratio: 0.854 # 3sigma 22 | - average_line_length_filter: # for code 23 | max_len: 500 # >3sigma (364) 24 | - character_repetition_filter: 25 | rep_len: 10 26 | max_ratio: 0.2 # >3sigma (0.12) 27 | - flagged_words_filter: 28 | lang: en 29 | tokenization: true 30 | max_ratio: 0.00047 # 3sigma 31 | - language_id_score_filter: # remove language filter 32 | min_score: 0.2 33 | - maximum_line_length_filter: # for code 34 | max_len: 13381 # 3sigma 35 | - perplexity_filter: 36 | lang: en 37 | max_ppl: 6000 # <3sigma (16516) 38 | - special_characters_filter: 39 | max_ratio: 0.5 # >3sigma (0.32) 40 | - words_num_filter: 41 | lang: en 42 | tokenization: true 43 | min_num: 1000 44 | max_num: 539754 # 3sigma 45 | - word_repetition_filter: 46 | lang: en 47 | tokenization: true 48 | rep_len: 10 49 | max_ratio: 0.194 # 3sigma 50 | 51 | - document_simhash_deduplicator: 52 | tokenization: space 53 | window_size: 6 54 | lowercase: true 55 | ignore_pattern: '\p{P}' 56 | num_blocks: 6 57 | hamming_distance: 4 58 | -------------------------------------------------------------------------------- /configs/data_juicer_recipes/redpajama-c4-refine.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'Data-Juicer-recipes-c4' 3 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 4 | export_path: '/path/to/your/dataset.jsonl' # 
path to your dataset result file 5 | 6 | np: 50 # number of subprocess to process your dataset 7 | open_tracer: True 8 | 9 | # process schedule 10 | # a list of several process operators with their arguments 11 | process: 12 | - clean_email_mapper: 13 | - clean_links_mapper: 14 | - fix_unicode_mapper: 15 | - punctuation_normalization_mapper: 16 | - whitespace_normalization_mapper: 17 | 18 | - alphanumeric_filter: 19 | tokenization: false 20 | min_ratio: 0.65 # <3sigma (0.740) 21 | max_ratio: 0.9 # >3sigma (0.867) 22 | - average_line_length_filter: # for code 23 | max_len: 3000 # >3sigma (1277) 24 | - character_repetition_filter: 25 | rep_len: 10 26 | max_ratio: 0.3 # >3sigma (0.168) 27 | - language_id_score_filter: 28 | min_score: 0.6 29 | - maximum_line_length_filter: # for code 30 | max_len: 4000 # >3sigma (2017) 31 | - perplexity_filter: 32 | lang: en 33 | max_ppl: 6000 #(>3sigma 4543) 34 | - special_characters_filter: 35 | max_ratio: 0.4 # > 3sigma (0.303) 36 | - words_num_filter: 37 | tokenization: true 38 | min_num: 20 39 | max_num: 10000 40 | - word_repetition_filter: 41 | lang: en 42 | tokenization: true 43 | rep_len: 10 44 | max_ratio: 0.231 # 3sigma 45 | 46 | - document_simhash_deduplicator: 47 | tokenization: space 48 | window_size: 6 49 | lowercase: true 50 | ignore_pattern: '\p{P}' 51 | num_blocks: 6 52 | hamming_distance: 4 53 | -------------------------------------------------------------------------------- /configs/datasets/local_json.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'dataset-local-json' 3 | dataset: 4 | configs: 5 | - type: 'local' 6 | path: 'path/to/json/file' 7 | -------------------------------------------------------------------------------- /configs/datasets/local_parquet.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'dataset-local-parquet' 3 | dataset: 4 | configs: 5 | - type: 'local' 6 | path: 'path/to/parquet/file' 7 | -------------------------------------------------------------------------------- /configs/datasets/mixture.yaml: -------------------------------------------------------------------------------- 1 | project_name: 'dataset-mixture' 2 | dataset: 3 | max_sample_num: 10000 4 | configs: 5 | - type: 'local' 6 | weight: 1.0 7 | path: 'path/to/json/file' 8 | - type: 'local' 9 | weight: 1.0 10 | path: 'path/to/csv/file' 11 | -------------------------------------------------------------------------------- /configs/datasets/remote_arxiv.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'dataset-remote-arxiv' 3 | dataset: 4 | configs: 5 | - type: 'remote' 6 | source: 'arxiv' 7 | lang: 'en' 8 | dump_date: 'latest' 9 | force_download: false 10 | url_limit: 2 11 | -------------------------------------------------------------------------------- /configs/datasets/remote_commoncrawl.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'dataset-remote-commoncrawl' 3 | dataset: 4 | configs: 5 | - type: 'remote' 6 | source: 'commoncrawl' 7 | start_snapshot: '2020-50' 8 | end_snapshot: '2021-04' 9 | aws: true 10 | force_download: false 11 | url_limit: 2 12 | -------------------------------------------------------------------------------- /configs/datasets/remote_huggingface.yaml: 
-------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'dataset-remote-huggingface' 3 | dataset: 4 | configs: 5 | - type: 'remote' 6 | source: 'huggingface' 7 | path: "HuggingFaceFW/fineweb" 8 | name: "CC-MAIN-2024-10" 9 | split: "train" 10 | limit: 1000 11 | -------------------------------------------------------------------------------- /configs/datasets/remote_modelscope.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'dataset-remote-modelscope' 3 | dataset: 4 | configs: 5 | - type: 'remote' 6 | source: 'modelscope' 7 | path: 'modelscope/clue' 8 | subset_name: 'afqmc' 9 | split: 'train' 10 | limit: 1000 11 | -------------------------------------------------------------------------------- /configs/datasets/remote_wiki.yaml: -------------------------------------------------------------------------------- 1 | # global parameters 2 | project_name: 'dataset-remote-wiki' 3 | dataset: 4 | configs: 5 | - type: 'remote' 6 | source: 'wiki' 7 | lang: 'en' 8 | dump_date: 'latest' 9 | force_download: false 10 | url_limit: 2 11 | -------------------------------------------------------------------------------- /configs/datasets/validation_required_fields.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | configs: 3 | - type: local 4 | path: /path/to/data.json 5 | 6 | validators: 7 | - type: required_fields 8 | required_fields: 9 | - "text" 10 | - "metadata" 11 | - "language" 12 | field_types: 13 | text: "str" 14 | metadata: "dict" 15 | language: "str" 16 | -------------------------------------------------------------------------------- /configs/datasets/validation_swift_messages.yaml: -------------------------------------------------------------------------------- 1 | project_name: validation_swift_messages_demo 2 | 3 | dataset: 4 | configs: 5 | - type: local 6 | path: ./demos/data/demo-dataset-chatml.jsonl 7 | 8 | text_keys: messages 9 | 10 | export_path: ./outputs/validation_swift_messages_demo/output.jsonl 11 | 12 | validators: 13 | - type: swift_messages 14 | min_turns: 1 # Minimum number of user-assistant turns 15 | max_turns: 20 # Maximum number of turns including system message 16 | 17 | process: 18 | - text_length_filter: 19 | max_len: 18221 # 3sigma 20 | -------------------------------------------------------------------------------- /configs/demo/analyzer.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for dataset 2 | 3 | # global parameters 4 | project_name: 'demo-analyzer' 5 | dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | export_path: './outputs/demo-analyzer/demo-analyzer-result.jsonl' 9 | 10 | # process schedule 11 | # a list of several process operators with their arguments 12 | process: 13 | - language_id_score_filter: 14 | lang: 'en' 15 | - perplexity_filter: 16 | lang: 'en' 17 | max_ppl: 1500 18 | -------------------------------------------------------------------------------- /configs/demo/bench/3_duplicate_pipeline.yaml: -------------------------------------------------------------------------------- 1 | # Sandbox config example 2 | 3 | # global parameters 4 | project_name: 'demo-bench' 5 | experiment_name: 'single_op_language_score' # for wandb tracer name 6 | work_dir: './outputs/demo-bench' # the default 
output dir for meta logging 7 | 8 | # configs for each job, the jobs will be executed according to the order in the list 9 | probe_job_configs: 10 | 11 | refine_recipe_job_configs: 12 | 13 | execution_job_configs: 14 | # train model 15 | - hook: 'TrainModelHook' 16 | meta_name: 17 | dj_configs: 18 | extra_configs: './configs/demo/bench/model_train_2_epoch.yaml' # the input data is set to be demo-dataset-with-multi-op-stats.jsonl 19 | # infer model 20 | - hook: 'InferModelHook' 21 | meta_name: 22 | dj_configs: 23 | extra_configs: './configs/demo/bench/model_infer.yaml' 24 | 25 | evaluation_job_configs: 26 | # vbench evaluation 27 | - hook: 'EvaluateDataHook' 28 | meta_name: 'vbench_eval' 29 | dj_configs: 30 | extra_configs: './configs/demo/bench/vbench_eval.yaml' 31 | -------------------------------------------------------------------------------- /configs/demo/bench/model_infer.yaml: -------------------------------------------------------------------------------- 1 | type: easyanimate 2 | model_name: "easyanimate" 3 | infer_name: "easyanimate-lora-generate" 4 | train: 5 | model_path: 6 | # path to the pixart model or the hugging face model 7 | pretrained_model_name_or_path: "PixArt-alpha/PixArt-XL-2-512x512" 8 | # path to pretrained easyanimate checkpoint. Following are the links to available checkpoints. 9 | # https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors 10 | transformer_path: "/PATH/TO/EASYANIMATE_MODEL" 11 | # Note that the root path is in "thirdparty/easy_animate" 12 | lora_path: "../../../outputs/demo-bench/models/checkpoint-2.safetensors" 13 | 14 | infer_config: 15 | # must match the pretrained easyanimate checkpoint. 16 | image_size: 256 17 | prompt_info_path: "../../../tools/mm_eval/vbench_metrics/VBench_mini_info.json" # Use VBench_full_info.json for full eval. 18 | gpu_num: 1 19 | batch_size: 8 20 | mixed_precision: "bf16" 21 | video_num_per_prompt: 5 22 | seed: 43 23 | 24 | saving_config: 25 | output_video_dir: "../../../outputs/demo-bench/generated_videos" 26 | -------------------------------------------------------------------------------- /configs/demo/bench/model_train.yaml: -------------------------------------------------------------------------------- 1 | type: easyanimate 2 | model_name: "easyanimate" 3 | trainer_name: "easyanimate-lora-trainer" 4 | train: 5 | tracker_config: 6 | # config for wandb 7 | project_name: "demo-bench" 8 | experiment_name: 'demo-single-op-model-train' 9 | model_path: 10 | # path to the pixart model or the hugging face model 11 | pretrained_model_name_or_path: "PixArt-alpha/PixArt-XL-2-512x512" 12 | # path to pretrained easyanimate checkpoint. Following are the links to available checkpoints. 13 | # https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors 14 | transformer_path: "/PATH/TO/EASYANIMATE_MODEL" 15 | dataset_path: 16 | # The root directory to videos. Set empty if it is the absolute path in the dataset. 17 | dataset_name: "" 18 | # path to the Data-Juicer dataset. Note that the root path is in "thirdparty/models/EasyAnimate" 19 | dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-for-train.jsonl" 20 | training_config: 21 | # image size, must match the pretrained easyanimate checkpoint. 
22 | sample_size: 256 23 | mixed_precision: "bf16" 24 | batch_size_per_gpu: 8 25 | gradient_accumulation_steps: 1 26 | num_train_epochs: 1 27 | dataloader_num_workers: 8 28 | seed: 42 29 | saving_config: 30 | # Note that the root path is in "thirdparty/models/EasyAnimate" 31 | output_dir: "../../../outputs/demo-bench/models" 32 | -------------------------------------------------------------------------------- /configs/demo/bench/model_train_2_epoch.yaml: -------------------------------------------------------------------------------- 1 | type: easyanimate 2 | model_name: "easyanimate" 3 | trainer_name: "easyanimate-lora-trainer" 4 | train: 5 | tracker_config: 6 | # config for wandb 7 | project_name: "demo-bench" 8 | experiment_name: 'demo-single-op-model-train' 9 | model_path: 10 | # path to the pixart model or the hugging face model 11 | pretrained_model_name_or_path: "PixArt-alpha/PixArt-XL-2-512x512" 12 | # path to pretrained easyanimate checkpoint. Following are the links to available checkpoints. 13 | # https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors 14 | transformer_path: "/PATH/TO/EASYANIMATE_MODEL" 15 | dataset_path: 16 | # The root directory to videos. Set empty if it is the absolute path in the dataset. 17 | dataset_name: "" 18 | # path to the Data-Juicer dataset. Note that the root path is in "thirdparty/easy_animate" 19 | dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl" 20 | training_config: 21 | # image size, must match the pretrained easyanimate checkpoint. 22 | sample_size: 256 23 | mixed_precision: "bf16" 24 | batch_size_per_gpu: 8 25 | gradient_accumulation_steps: 1 26 | num_train_epochs: 2 27 | dataloader_num_workers: 8 28 | seed: 42 29 | saving_config: 30 | # Note that the root path is in "thirdparty/easy_animate" 31 | output_dir: "../../../outputs/demo-bench/models" 32 | -------------------------------------------------------------------------------- /configs/demo/bench/vbench_eval.yaml: -------------------------------------------------------------------------------- 1 | type: vbench_video_evaluator 2 | 3 | # The vbench prompts for video generation. Use VBench_full_info.json for full eval. 4 | prompt_path: ./tools/mm_eval/vbench_metrics/VBench_mini_info.json 5 | 6 | # The path to the dir of generated videos 7 | videos_path: ./outputs/demo-bench/generated_videos 8 | 9 | # The dir to store the eval results 10 | result_dir: ./outputs/demo-bench/eval_results 11 | 12 | # Give a name for this eval 13 | eval_name: mini_test 14 | 15 | # If true, load the required model for VBench from the cache path of environment parameter VBENCH_CACHE_DIR 16 | load_ckpt_from_local: false 17 | 18 | # The dimensions considered in this eval. 
19 | # All dimensions include: ['subject_consistency', 'background_consistency', 'temporal_flickering', 20 | # 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality', 'object_class', 21 | # 'multiple_objects', 'human_action', 'color', 'spatial_relationship', 'scene', 'temporal_style', 22 | # 'appearance_style', 'overall_consistency'] 23 | dimension_list: 24 | - subject_consistency 25 | - dynamic_degree 26 | -------------------------------------------------------------------------------- /configs/demo/dedup.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for dataset 2 | 3 | # global parameters 4 | project_name: 'demo-dedup' 5 | dataset_path: './demos/data/demo-dataset-deduplication.jsonl' # path to your dataset directory or file 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | open_tracer: true 9 | 10 | export_path: './outputs/demo-dedup/demo-dedup-processed.jsonl' 11 | 12 | # process schedule 13 | # a list of several process operators with their arguments 14 | process: 15 | - language_id_score_filter: 16 | lang: en 17 | min_score: 0.5 18 | # - document_deduplicator: 19 | # lowercase: false 20 | # ignore_non_character: false 21 | - document_minhash_deduplicator: 22 | tokenization: 'character' 23 | -------------------------------------------------------------------------------- /configs/demo/process-huggingface.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for dataset 2 | 3 | # global parameters 4 | project_name: 'demo-process' 5 | dataset: 6 | configs: 7 | - type: 'remote' 8 | source: 'huggingface' 9 | path: 'hugfaceguy0001/retarded_bar' 10 | name: 'question' 11 | split: 'train' 12 | 13 | np: 4 # number of subprocess to process your dataset 14 | 15 | export_path: './outputs/demo-process/demo-processed.jsonl' 16 | 17 | # process schedule 18 | # a list of several process operators with their arguments 19 | process: 20 | - language_id_score_filter: 21 | lang: 'zh' 22 | min_score: 0.8 23 | -------------------------------------------------------------------------------- /configs/demo/process.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for dataset 2 | 3 | # global parameters 4 | project_name: 'demo-process' 5 | dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | export_path: './outputs/demo-process/demo-processed.jsonl' 9 | 10 | # process schedule 11 | # a list of several process operators with their arguments 12 | process: 13 | - language_id_score_filter: 14 | lang: 'zh' 15 | min_score: 0.8 16 | -------------------------------------------------------------------------------- /configs/demo/sandbox/gpt3_data_quality_eval_config.yaml: -------------------------------------------------------------------------------- 1 | type: dj_text_quality_classifier 2 | dataset_path: './outputs/demo-process/demo-processed.jsonl' 3 | -------------------------------------------------------------------------------- /configs/demo/sandbox/gpt3_extra_train_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "modelscope", 3 | "dataset_path": "./outputs/demo-process/demo-processed.jsonl", 4 | "work_dir": "./outputs/sandbox-train/", 5 | "model_name": "iic/nlp_gpt3_text-generation_chinese-base", 6 | "trainer_name": 
"nlp-base-trainer", 7 | "key_remapping": { 8 | "text": "src_txt" 9 | }, 10 | "train": { 11 | "max_epochs": 2, 12 | "lr_scheduler": { 13 | "type": "StepLR", 14 | "step_size": 2, 15 | "options": { 16 | "by_epoch": false 17 | } 18 | }, 19 | "optimizer": { 20 | "type": "AdamW", 21 | "lr": 3e-4 22 | }, 23 | "dataloader": { 24 | "batch_size_per_gpu": 2, 25 | "workers_per_gpu": 0 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /configs/demo/sandbox/gpt3_extra_train_config.yaml: -------------------------------------------------------------------------------- 1 | type: modelscope 2 | dataset_path: './outputs/demo-process/demo-processed.jsonl' 3 | work_dir: './outputs/sandbox-train/' 4 | model_name: "iic/nlp_gpt3_text-generation_chinese-base" 5 | trainer_name: "nlp-base-trainer" 6 | key_remapping: 7 | text: "src_txt" 8 | train: 9 | max_epochs: 2 10 | lr_scheduler: 11 | type: "StepLR" 12 | step_size: 2 13 | options: 14 | by_epoch: false 15 | optimizer: 16 | type: "AdamW" 17 | lr: 0.0003 18 | dataloader: 19 | batch_size_per_gpu: 2 20 | workers_per_gpu: 0 21 | -------------------------------------------------------------------------------- /configs/reproduced_bloom/README.md: -------------------------------------------------------------------------------- 1 | # BLOOM Config Files 2 | 3 | This folder contains example configuration files to easily and quickly reproduce the processing flow of the [ROOTS](https://github.com/bigscience-workshop/data-preparation) dataset, created by the BigScience initiative to train the BLOOM models. 4 | 5 | ## Oscar 6 | The raw data files can be downloaded as described in [BLOOM/Oscar](https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/01b_oscar_cleaning_and_filtering). Then use [bloom-oscar.yaml](bloom-oscar.yaml) to perform the whole processing. 7 | 8 | An analysis of our reproduction will be published soon. 
9 | -------------------------------------------------------------------------------- /configs/reproduced_bloom/README_ZH.md: -------------------------------------------------------------------------------- 1 | # BLOOM 配置文件 2 | 3 | 此文件夹包含的配置文件用于轻松复现 [ROOTS](https://github.com/bigscience-workshop/data-preparation) 的处理流程,该数据集由 BigScience 创建并用于训练 BLOOM 模型。 4 | 5 | ## Oscar 6 | 7 | 原始文件可以参照 [BLOOM/Oscar](https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/01b_oscar_cleaning_and_filtering) 下载,然后使用 [bloom-oscar.yaml](bloom-oscar.yaml) 进行完整的处理流程。 8 | 9 | 对我们复现结果的分析将在稍后发布。 10 | -------------------------------------------------------------------------------- /configs/reproduced_bloom/bloom-oscar.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for Oscar used in BLOOM 2 | 3 | # global parameters 4 | project_name: 'bloom_oscar' 5 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | export_path: '/path/to/result/dataset.jsonl' 9 | 10 | # process schedule 11 | # a list of several process operators with their arguments 12 | process: 13 | # filter English corpus 14 | - language_id_score_filter: 15 | lang: en 16 | min_score: 0.8 17 | 18 | # basic process for regular English text 19 | - whitespace_normalization_mapper: 20 | - punctuation_normalization_mapper: 21 | - fix_unicode_mapper: 22 | - remove_words_with_incorrect_substrings_mapper: 23 | - remove_long_words_mapper: 24 | max_len: 25 25 | 26 | # basic filter rules for regular English text 27 | - words_num_filter: 28 | min_num: 20 29 | max_num: 100000 30 | - character_repetition_filter: 31 | rep_len: 10 32 | min_ratio: 0.0 33 | max_ratio: 0.106 34 | - word_repetition_filter: 35 | rep_len: 5 36 | min_ratio: 0.0 37 | max_ratio: 0.19 38 | - special_characters_filter: 39 | min_ratio: 0.0 40 | max_ratio: 0.4 41 | - stopwords_filter: 42 | lang: en 43 | min_ratio: 0.3 44 | - flagged_words_filter: 45 | lang: en 46 | max_ratio: 0.01 47 | - perplexity_filter: 48 | lang: en 49 | max_ppl: 1500 50 | 51 | # basic deduplication rules for regular English text 52 | - document_simhash_deduplicator: 53 | tokenization: space 54 | window_size: 6 55 | lowercase: true 56 | ignore_pattern: '\p{P}' 57 | num_blocks: 6 58 | hamming_distance: 4 59 | -------------------------------------------------------------------------------- /configs/reproduced_redpajama/redpajama-arxiv.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for arXiv dataset 2 | 3 | # global parameters 4 | project_name: 'arXiv' 5 | dataset_path: '/path/to/your/dataset/dir/or/file' # path to your dataset directory or file 6 | np: 32 # number of subprocess to process your dataset 7 | 8 | export_path: '/path/to/your/exported/dataset/file' 9 | 10 | # process schedule 11 | # a list of several process operators with their arguments 12 | process: 13 | - remove_header_mapper: 14 | drop_no_head: true 15 | - remove_comments_mapper: 16 | doc_type: ['md', 'tex'] 17 | inline: true 18 | multiline: true 19 | - remove_bibliography_mapper: 20 | - expand_macro_mapper: 21 | - text_length_filter: 22 | min_len: 1 23 | -------------------------------------------------------------------------------- /configs/reproduced_redpajama/redpajama-books.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for Books used in 
RedPajam 2 | 3 | # global parameters 4 | project_name: 'RedPajam-books' 5 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | export_path: '/path/to/result/dataset.jsonl' 9 | 10 | # process schedule 11 | # a list of several process operators with their arguments 12 | process: 13 | - document_simhash_deduplicator: 14 | tokenization: character 15 | window_size: 6 16 | lowercase: True 17 | ignore_pattern: '[^\w]+' 18 | num_blocks: 6 19 | hamming_distance: 5 20 | -------------------------------------------------------------------------------- /configs/reproduced_redpajama/redpajama-code.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for codes used in RedPajam 2 | 3 | # global parameters 4 | project_name: 'RedPajam-codes' 5 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | export_path: '/path/to/result/dataset.jsonl' 9 | 10 | # process schedule 11 | # a list of several process operators with their arguments 12 | process: 13 | - document_deduplicator: 14 | - clean_copyright_mapper: 15 | - maximum_line_length_filter: 16 | min_len: 1 17 | max_len: 1000 18 | - average_line_length_filter: 19 | min_len: 1 20 | max_len: 100 21 | - alphanumeric_filter: 22 | min_ratio: 0.25 23 | max_ratio: 1.0 24 | - alphanumeric_filter: 25 | tokenization: True 26 | min_ratio: 1.5 27 | - suffix_filter: 28 | suffixes: [".asm", ".bat", ".cmd", ".c", ".h", ".cs", ".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H", ".cmake", ".css", 29 | ".dockerfile", ".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp", ".go", ".hs", ".html", ".java", ".js", 30 | ".jl", ".lua", ".md", ".markdown", ".php", ".php3", ".php4", ".php5", ".phps", ".phpt", ".pl", ".pm", ".pod", ".perl", 31 | ".ps1", ".psd1", ".psm1", ".py", ".rb", ".rs", ".sql", ".scala", ".sh", ".bash", ".command", ".zsh", ".ts", ".tsx", 32 | ".tex", ".vb", "Dockerfile", "Makefile", ".xml", ".rst", ".m", ".smali"] 33 | -------------------------------------------------------------------------------- /configs/reproduced_redpajama/redpajama-stackexchange.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for stackexchange used in RedPajam 2 | 3 | # global parameters 4 | project_name: 'RedPajam-stackexchange' 5 | dataset_path: '/path/to/your/dataset' # path to your dataset directory or file 6 | np: 4 # number of subprocess to process your dataset 7 | use_cache: False 8 | 9 | export_path: '/path/to/result/dataset.jsonl' 10 | 11 | # process schedule 12 | # a list of several process operators with their arguments 13 | process: 14 | - clean_html_mapper: 15 | - language_id_score_filter: 16 | lang: '' 17 | min_score: 0.0 18 | -------------------------------------------------------------------------------- /data_juicer/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.3.3' 2 | 3 | import os 4 | import subprocess 5 | import sys 6 | 7 | from loguru import logger 8 | # allow loading truncated images for some too large images. 
9 | from PIL import ImageFile 10 | 11 | from data_juicer.utils.availability_utils import _is_package_available 12 | from data_juicer.utils.lazy_loader import LazyLoader 13 | 14 | torch = LazyLoader('torch') 15 | ImageFile.LOAD_TRUNCATED_IMAGES = True 16 | 17 | # For now, only INFO will be shown. Later the severity level will be changed 18 | # when setup_logger is called to initialize the logger. 19 | logger.remove() 20 | logger.add(sys.stderr, level='INFO') 21 | 22 | 23 | def _cuda_device_count(): 24 | _torch_available = _is_package_available('torch') 25 | 26 | if _torch_available: 27 | return torch.cuda.device_count() 28 | 29 | try: 30 | nvidia_smi_output = subprocess.check_output(['nvidia-smi', '-L'], 31 | text=True) 32 | all_devices = nvidia_smi_output.strip().split('\n') 33 | 34 | cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') 35 | if cuda_visible_devices is not None: 36 | logger.warning( 37 | 'CUDA_VISIBLE_DEVICES is ignored when torch is unavailable. ' 38 | 'All detected GPUs will be used.') 39 | 40 | return len(all_devices) 41 | except Exception: 42 | # nvidia-smi not found or other error 43 | return 0 44 | 45 | 46 | def cuda_device_count(): 47 | return _cuda_device_count() 48 | 49 | 50 | def is_cuda_available(): 51 | return cuda_device_count() > 0 52 | -------------------------------------------------------------------------------- /data_juicer/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | from .column_wise_analysis import ColumnWiseAnalysis 2 | from .diversity_analysis import DiversityAnalysis 3 | from .overall_analysis import OverallAnalysis 4 | 5 | __all__ = [ 6 | 'ColumnWiseAnalysis', 7 | 'DiversityAnalysis', 8 | 'OverallAnalysis', 9 | ] 10 | -------------------------------------------------------------------------------- /data_juicer/analysis/draw.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import seaborn as sns 4 | 5 | 6 | def draw_heatmap(data, 7 | xlabels, 8 | ylabels='auto', 9 | figsize=None, 10 | triangle=False, 11 | show=False): 12 | """ 13 | Draw heatmap of input data with special labels. 14 | 15 | :param data: input data, now support 16 | [`list`, `tuple`, `numpy array`, 'torch tensor'] 17 | :param xlabels: x axis labels. 18 | :param ylabels: y axis labels, if None, use xlabels. 19 | :param figsize: figure size. 20 | :param triangle: only display triangle. 21 | :return: a plot figure. 
22 | """ 23 | figsize = figsize if figsize else (8 * 2.5, 6 * 2.5) 24 | _, ax = plt.subplots(figsize=figsize) 25 | mask = None 26 | if triangle: 27 | mask = np.triu(np.ones_like(data)) 28 | ax.tick_params( 29 | right=True, 30 | top=True, 31 | labelright=True, 32 | labeltop=True, 33 | ) 34 | sns.heatmap(data, 35 | ax=ax, 36 | cmap='Oranges', 37 | annot=True, 38 | mask=mask, 39 | linewidths=.05, 40 | square=True, 41 | xticklabels=xlabels, 42 | yticklabels=ylabels, 43 | annot_kws={'size': 8}) 44 | plt.subplots_adjust(left=.1, right=0.95, bottom=0.22, top=0.95) 45 | fig = plt.gcf() 46 | if show: 47 | plt.show() 48 | return fig 49 | -------------------------------------------------------------------------------- /data_juicer/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import (export_config, get_default_cfg, get_init_configs, 2 | init_configs, merge_config, prepare_side_configs) 3 | 4 | __all__ = [ 5 | 'init_configs', 'get_init_configs', 'export_config', 'merge_config', 6 | 'prepare_side_configs', 'get_default_cfg' 7 | ] 8 | -------------------------------------------------------------------------------- /data_juicer/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .adapter import Adapter 2 | from .analyzer import Analyzer 3 | from .data import NestedDataset 4 | from .executor import DefaultExecutor, ExecutorBase, ExecutorFactory 5 | from .exporter import Exporter 6 | from .monitor import Monitor 7 | from .tracer import Tracer 8 | 9 | __all__ = [ 10 | 'Adapter', 11 | 'Analyzer', 12 | 'NestedDataset', 13 | 'ExecutorBase', 14 | 'ExecutorFactory', 15 | 'DefaultExecutor', 16 | 'Exporter', 17 | 'Monitor', 18 | 'Tracer', 19 | ] 20 | -------------------------------------------------------------------------------- /data_juicer/core/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dj_dataset import (DJDataset, NestedDataset, 2 | add_same_content_to_new_column, 3 | wrap_func_with_nested_access) 4 | 5 | __all__ = [ 6 | 'DJDataset', 'NestedDataset', 'wrap_func_with_nested_access', 7 | 'add_same_content_to_new_column' 8 | ] 9 | -------------------------------------------------------------------------------- /data_juicer/core/executor/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import ExecutorBase 2 | from .default_executor import DefaultExecutor 3 | from .factory import ExecutorFactory 4 | 5 | __all__ = ['ExecutorBase', 'ExecutorFactory', 'DefaultExecutor'] 6 | -------------------------------------------------------------------------------- /data_juicer/core/executor/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional 3 | 4 | from jsonargparse import Namespace 5 | from pydantic import PositiveInt 6 | 7 | from data_juicer.config import init_configs 8 | 9 | 10 | class ExecutorBase(ABC): 11 | 12 | @abstractmethod 13 | def __init__(self, cfg: Optional[Namespace] = None): 14 | self.cfg = init_configs() if cfg is None else cfg 15 | self.executor_type = 'base' 16 | 17 | @abstractmethod 18 | def run(self, 19 | load_data_np: Optional[PositiveInt] = None, 20 | skip_return=False): 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /data_juicer/core/executor/factory.py: 
-------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from .default_executor import DefaultExecutor 4 | from .ray_executor import RayExecutor 5 | 6 | 7 | class ExecutorFactory: 8 | 9 | @staticmethod 10 | def create_executor( 11 | executor_type: str) -> Union[DefaultExecutor, RayExecutor]: 12 | if executor_type in ('local', 'default'): 13 | return DefaultExecutor() 14 | elif executor_type == 'ray': 15 | return RayExecutor() 16 | # TODO: add nemo support 17 | # elif executor_type == "nemo": 18 | # return NemoExecutor() 19 | # TODO: add dask support 20 | # elif executor_type == "dask": 21 | # return DaskExecutor() 22 | else: 23 | raise ValueError('Unsupported executor type') 24 | -------------------------------------------------------------------------------- /data_juicer/download/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/data_juicer/download/__init__.py -------------------------------------------------------------------------------- /data_juicer/download/commoncrawl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/data_juicer/download/commoncrawl.py -------------------------------------------------------------------------------- /data_juicer/format/__init__.py: -------------------------------------------------------------------------------- 1 | from .csv_formatter import CsvFormatter 2 | from .empty_formatter import EmptyFormatter, RayEmptyFormatter 3 | from .formatter import LocalFormatter, RemoteFormatter 4 | from .json_formatter import JsonFormatter 5 | from .parquet_formatter import ParquetFormatter 6 | from .text_formatter import TextFormatter 7 | from .tsv_formatter import TsvFormatter 8 | 9 | __all__ = [ 10 | 'JsonFormatter', 'LocalFormatter', 'RemoteFormatter', 'TextFormatter', 11 | 'ParquetFormatter', 'CsvFormatter', 'TsvFormatter', 'EmptyFormatter', 12 | 'RayEmptyFormatter' 13 | ] 14 | -------------------------------------------------------------------------------- /data_juicer/format/csv_formatter.py: -------------------------------------------------------------------------------- 1 | from .formatter import FORMATTERS, LocalFormatter 2 | 3 | 4 | @FORMATTERS.register_module() 5 | class CsvFormatter(LocalFormatter): 6 | """ 7 | The class is used to load and format csv-type files. 8 | 9 | Default suffixes is `['.csv']` 10 | """ 11 | SUFFIXES = ['.csv'] 12 | 13 | def __init__(self, dataset_path, suffixes=None, **kwargs): 14 | """ 15 | Initialization method. 16 | 17 | :param dataset_path: a dataset file or a dataset directory 18 | :param suffixes: files with specified suffixes to be processed 19 | :param kwargs: extra args 20 | """ 21 | super().__init__( 22 | dataset_path=dataset_path, 23 | suffixes=suffixes if suffixes else self.SUFFIXES, 24 | type='csv', 25 | **kwargs, 26 | ) 27 | -------------------------------------------------------------------------------- /data_juicer/format/json_formatter.py: -------------------------------------------------------------------------------- 1 | from .formatter import FORMATTERS, LocalFormatter 2 | 3 | 4 | @FORMATTERS.register_module() 5 | class JsonFormatter(LocalFormatter): 6 | """ 7 | The class is used to load and format json-type files. 
8 | 9 | Default suffixes is `['.json', '.jsonl', '.jsonl.zst']` 10 | """ 11 | SUFFIXES = ['.json', '.jsonl', '.jsonl.zst'] 12 | 13 | def __init__(self, dataset_path, suffixes=None, **kwargs): 14 | """ 15 | Initialization method. 16 | 17 | :param dataset_path: a dataset file or a dataset directory 18 | :param suffixes: files with specified suffixes to be processed 19 | :param kwargs: extra args 20 | """ 21 | super().__init__( 22 | dataset_path=dataset_path, 23 | suffixes=suffixes if suffixes else self.SUFFIXES, 24 | type='json', 25 | **kwargs, 26 | ) 27 | -------------------------------------------------------------------------------- /data_juicer/format/parquet_formatter.py: -------------------------------------------------------------------------------- 1 | from .formatter import FORMATTERS, LocalFormatter 2 | 3 | 4 | @FORMATTERS.register_module() 5 | class ParquetFormatter(LocalFormatter): 6 | """ 7 | The class is used to load and format parquet-type files. 8 | 9 | Default suffixes is `['.parquet']` 10 | """ 11 | SUFFIXES = ['.parquet'] 12 | 13 | def __init__(self, dataset_path, suffixes=None, **kwargs): 14 | """ 15 | Initialization method. 16 | 17 | :param dataset_path: a dataset file or a dataset directory 18 | :param suffixes: files with specified suffixes to be processed 19 | :param kwargs: extra args 20 | """ 21 | super().__init__( 22 | dataset_path=dataset_path, 23 | suffixes=suffixes if suffixes else self.SUFFIXES, 24 | type='parquet', 25 | **kwargs, 26 | ) 27 | -------------------------------------------------------------------------------- /data_juicer/format/tsv_formatter.py: -------------------------------------------------------------------------------- 1 | from .formatter import FORMATTERS, LocalFormatter 2 | 3 | 4 | @FORMATTERS.register_module() 5 | class TsvFormatter(LocalFormatter): 6 | """ 7 | The class is used to load and format tsv-type files. 8 | 9 | Default suffixes is `['.tsv']` 10 | """ 11 | SUFFIXES = ['.tsv'] 12 | 13 | def __init__(self, dataset_path, suffixes=None, **kwargs): 14 | """ 15 | Initialization method. 16 | 17 | :param dataset_path: a dataset file or a dataset directory 18 | :param suffixes: files with specified suffixes to be processed 19 | :param kwargs: extra args, e.g. `delimiter = ','` 20 | """ 21 | super().__init__( 22 | dataset_path=dataset_path, 23 | suffixes=suffixes if suffixes else self.SUFFIXES, 24 | type='csv', 25 | delimiter='\t', 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /data_juicer/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # yapf: disable 2 | from . 
import aggregator, deduplicator, filter, grouper, mapper, selector 3 | from .base_op import (NON_STATS_FILTERS, OPERATORS, TAGGING_OPS, UNFORKABLE, 4 | Aggregator, Deduplicator, Filter, Grouper, Mapper, 5 | Selector) 6 | from .load import load_ops 7 | 8 | __all__ = [ 9 | 'load_ops', 10 | 'Filter', 11 | 'Mapper', 12 | 'Deduplicator', 13 | 'Selector', 14 | 'Grouper', 15 | 'Aggregator', 16 | 'UNFORKABLE', 17 | 'NON_STATS_FILTERS', 18 | 'OPERATORS', 19 | 'TAGGING_OPS', 20 | ] 21 | -------------------------------------------------------------------------------- /data_juicer/ops/aggregator/__init__.py: -------------------------------------------------------------------------------- 1 | from .entity_attribute_aggregator import EntityAttributeAggregator 2 | from .meta_tags_aggregator import MetaTagsAggregator 3 | from .most_relevant_entities_aggregator import MostRelevantEntitiesAggregator 4 | from .nested_aggregator import NestedAggregator 5 | 6 | __all__ = [ 7 | 'NestedAggregator', 'MetaTagsAggregator', 'EntityAttributeAggregator', 8 | 'MostRelevantEntitiesAggregator' 9 | ] 10 | -------------------------------------------------------------------------------- /data_juicer/ops/common/__init__.py: -------------------------------------------------------------------------------- 1 | from .helper_func import (get_sentences_from_document, get_words_from_document, 2 | merge_on_whitespace_tab_newline, 3 | split_on_newline_tab_whitespace, split_on_whitespace, 4 | split_text_by_punctuation, strip, words_augmentation, 5 | words_refinement) 6 | from .special_characters import SPECIAL_CHARACTERS 7 | 8 | __all__ = [ 9 | 'get_sentences_from_document', 'get_words_from_document', 10 | 'merge_on_whitespace_tab_newline', 'split_on_newline_tab_whitespace', 11 | 'split_on_whitespace', 'strip', 'words_augmentation', 'words_refinement', 12 | 'split_text_by_punctuation', 'SPECIAL_CHARACTERS' 13 | ] 14 | -------------------------------------------------------------------------------- /data_juicer/ops/common/special_characters.py: -------------------------------------------------------------------------------- 1 | # Most of the code here has been modified from: 2 | # https://huggingface.co/spaces/huggingface/text-data-filtering 3 | # -------------------------------------------------------- 4 | 5 | import string 6 | 7 | import emoji 8 | 9 | # special characters 10 | MAIN_SPECIAL_CHARACTERS = string.punctuation + string.digits \ 11 | + string.whitespace 12 | OTHER_SPECIAL_CHARACTERS = ( 13 | "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═" 14 | "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖" 15 | "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚" 16 | "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖" 17 | "」﴾》" 18 | ) 19 | EMOJI = list(emoji.EMOJI_DATA.keys()) 20 | SPECIAL_CHARACTERS = set(MAIN_SPECIAL_CHARACTERS + OTHER_SPECIAL_CHARACTERS) 21 | SPECIAL_CHARACTERS.update(EMOJI) 22 | 23 | # various whitespaces for whitespace normalization 24 | # whitespaces in unicode can be found here: 25 | # https://en.wikipedia.org/wiki/Whitespace_character 26 | VARIOUS_WHITESPACES = { 27 | ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 28 | ' ', ' ', ' ', ' ', '​', '‌', '‍', '⁠', '', '„' 29 | } 30 | -------------------------------------------------------------------------------- /data_juicer/ops/deduplicator/__init__.py: -------------------------------------------------------------------------------- 1 | from .document_deduplicator import DocumentDeduplicator 2 | from .document_minhash_deduplicator import 
DocumentMinhashDeduplicator 3 | from .document_simhash_deduplicator import DocumentSimhashDeduplicator 4 | from .image_deduplicator import ImageDeduplicator 5 | from .ray_basic_deduplicator import RayBasicDeduplicator 6 | from .ray_bts_minhash_deduplicator import RayBTSMinhashDeduplicator 7 | from .ray_document_deduplicator import RayDocumentDeduplicator 8 | from .ray_image_deduplicator import RayImageDeduplicator 9 | from .ray_video_deduplicator import RayVideoDeduplicator 10 | from .video_deduplicator import VideoDeduplicator 11 | 12 | __all__ = [ 13 | 'DocumentDeduplicator', 14 | 'DocumentMinhashDeduplicator', 15 | 'DocumentSimhashDeduplicator', 16 | 'ImageDeduplicator', 17 | 'RayBasicDeduplicator', 18 | 'RayDocumentDeduplicator', 19 | 'RayImageDeduplicator', 20 | 'RayVideoDeduplicator', 21 | 'RayImageDeduplicator', 22 | 'RayBTSMinhashDeduplicator', 23 | 'VideoDeduplicator', 24 | ] 25 | -------------------------------------------------------------------------------- /data_juicer/ops/filter/suffix_filter.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | from data_juicer.utils.constant import Fields 4 | 5 | from ..base_op import NON_STATS_FILTERS, OPERATORS, Filter 6 | 7 | OP_NAME = 'suffix_filter' 8 | 9 | 10 | @NON_STATS_FILTERS.register_module(OP_NAME) 11 | @OPERATORS.register_module(OP_NAME) 12 | class SuffixFilter(Filter): 13 | """Filter to keep samples with specified suffix.""" 14 | 15 | def __init__(self, suffixes: Union[str, List[str]] = [], *args, **kwargs): 16 | """ 17 | Initialization method. 18 | 19 | :param suffixes: the suffix of text that will be keep. 20 | For example: '.txt', 'txt' or ['txt', '.pdf', 'docx'] 21 | :param args: extra args 22 | :param kwargs: extra args 23 | """ 24 | super().__init__(*args, **kwargs) 25 | if suffixes is None: 26 | self.suffixes = [] 27 | elif isinstance(suffixes, str): 28 | self.suffixes = [suffixes] 29 | else: 30 | self.suffixes = suffixes 31 | 32 | def compute_stats_single(self, sample): 33 | return sample 34 | 35 | def process_single(self, sample): 36 | if self.suffixes: 37 | if sample[Fields.suffix] in self.suffixes: 38 | return True 39 | else: 40 | return False 41 | else: 42 | return True 43 | -------------------------------------------------------------------------------- /data_juicer/ops/grouper/__init__.py: -------------------------------------------------------------------------------- 1 | from .key_value_grouper import KeyValueGrouper 2 | from .naive_grouper import NaiveGrouper 3 | from .naive_reverse_grouper import NaiveReverseGrouper 4 | 5 | __all__ = ['KeyValueGrouper', 'NaiveGrouper', 'NaiveReverseGrouper'] 6 | -------------------------------------------------------------------------------- /data_juicer/ops/grouper/key_value_grouper.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from data_juicer.utils.common_utils import dict_to_hash, nested_access 4 | 5 | from ..base_op import OPERATORS, Grouper, convert_list_dict_to_dict_list 6 | from .naive_grouper import NaiveGrouper 7 | 8 | 9 | @OPERATORS.register_module('key_value_grouper') 10 | class KeyValueGrouper(Grouper): 11 | """Group samples to batched samples according values in given keys. """ 12 | 13 | def __init__(self, 14 | group_by_keys: Optional[List[str]] = None, 15 | *args, 16 | **kwargs): 17 | """ 18 | Initialization method. 19 | 20 | :param group_by_keys: group samples according values in the keys. 
21 | Support for nested keys such as "__dj__stats__.text_len". 22 | It is [self.text_key] in default. 23 | :param args: extra args 24 | :param kwargs: extra args 25 | """ 26 | super().__init__(*args, **kwargs) 27 | 28 | self.group_by_keys = group_by_keys or [self.text_key] 29 | self.naive_grouper = NaiveGrouper() 30 | 31 | def process(self, dataset): 32 | 33 | if len(dataset) == 0: 34 | return dataset 35 | 36 | sample_map = {} 37 | for sample in dataset: 38 | cur_dict = {} 39 | for key in self.group_by_keys: 40 | cur_dict[key] = nested_access(sample, key) 41 | sample_key = dict_to_hash(cur_dict) 42 | if sample_key in sample_map: 43 | sample_map[sample_key].append(sample) 44 | else: 45 | sample_map[sample_key] = [sample] 46 | 47 | batched_samples = [ 48 | convert_list_dict_to_dict_list(sample_map[k]) for k in sample_map 49 | ] 50 | 51 | return batched_samples 52 | -------------------------------------------------------------------------------- /data_juicer/ops/grouper/naive_grouper.py: -------------------------------------------------------------------------------- 1 | from ..base_op import OPERATORS, Grouper, convert_list_dict_to_dict_list 2 | 3 | 4 | @OPERATORS.register_module('naive_grouper') 5 | class NaiveGrouper(Grouper): 6 | """Group all samples to one batched sample. """ 7 | 8 | def __init__(self, *args, **kwargs): 9 | """ 10 | Initialization method. 11 | 12 | :param args: extra args 13 | :param kwargs: extra args 14 | """ 15 | super().__init__(*args, **kwargs) 16 | 17 | def process(self, dataset): 18 | 19 | if len(dataset) == 0: 20 | return dataset 21 | 22 | batched_sample = convert_list_dict_to_dict_list(dataset) 23 | 24 | return [batched_sample] 25 | -------------------------------------------------------------------------------- /data_juicer/ops/grouper/naive_reverse_grouper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from data_juicer.utils.constant import Fields 5 | from data_juicer.utils.file_utils import create_directory_if_not_exists 6 | 7 | from ..base_op import OPERATORS, Grouper, convert_dict_list_to_list_dict 8 | 9 | 10 | @OPERATORS.register_module('naive_reverse_grouper') 11 | class NaiveReverseGrouper(Grouper): 12 | """Split batched samples to samples. """ 13 | 14 | def __init__(self, batch_meta_export_path=None, *args, **kwargs): 15 | """ 16 | Initialization method. 17 | 18 | :param batch_meta_export_path: the path to export the batch meta. 19 | Just drop the batch meta if it is None. 
20 | :param args: extra args 21 | :param kwargs: extra args 22 | """ 23 | super().__init__(*args, **kwargs) 24 | self.batch_meta_export_path = batch_meta_export_path 25 | 26 | def process(self, dataset): 27 | 28 | if len(dataset) == 0: 29 | return dataset 30 | 31 | samples = [] 32 | batch_metas = [] 33 | for sample in dataset: 34 | if Fields.batch_meta in sample: 35 | batch_metas.append(sample[Fields.batch_meta]) 36 | sample = { 37 | k: sample[k] 38 | for k in sample if k != Fields.batch_meta 39 | } 40 | samples.extend(convert_dict_list_to_list_dict(sample)) 41 | if self.batch_meta_export_path is not None: 42 | create_directory_if_not_exists( 43 | os.path.dirname(self.batch_meta_export_path)) 44 | with open(self.batch_meta_export_path, 'w') as f: 45 | for batch_meta in batch_metas: 46 | f.write(json.dumps(batch_meta, ensure_ascii=False) + '\n') 47 | 48 | return samples 49 | -------------------------------------------------------------------------------- /data_juicer/ops/load.py: -------------------------------------------------------------------------------- 1 | from .base_op import OPERATORS 2 | 3 | 4 | def load_ops(process_list): 5 | """ 6 | Load op list according to the process list from config file. 7 | 8 | :param process_list: A process list. Each item is an op name and its 9 | arguments. 10 | :return: The op instance list. 11 | """ 12 | ops = [] 13 | new_process_list = [] 14 | 15 | for process in process_list: 16 | op_name, args = list(process.items())[0] 17 | ops.append(OPERATORS.modules[op_name](**args)) 18 | new_process_list.append(process) 19 | 20 | # store the OP configs into each OP 21 | for op_cfg, op in zip(new_process_list, ops): 22 | op._op_cfg = op_cfg 23 | 24 | return ops 25 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/annotation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/data_juicer/ops/mapper/annotation/__init__.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/calibrate_query_mapper.py: -------------------------------------------------------------------------------- 1 | from data_juicer.ops.base_op import OPERATORS 2 | from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper 3 | 4 | OP_NAME = 'calibrate_query_mapper' 5 | 6 | 7 | # TODO: LLM-based inference. 8 | @OPERATORS.register_module(OP_NAME) 9 | class CalibrateQueryMapper(CalibrateQAMapper): 10 | """ 11 | Mapper to calibrate query in question-answer pairs based on reference text. 12 | """ 13 | 14 | DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【问题】进行校准,\ 15 | 使其更加详细、准确,且仍可以由原答案回答。只输出校准后的问题,不要输出多余内容。' 16 | 17 | def parse_output(self, raw_output): 18 | return raw_output.strip(), None 19 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/calibrate_response_mapper.py: -------------------------------------------------------------------------------- 1 | from data_juicer.ops.base_op import OPERATORS 2 | from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper 3 | 4 | OP_NAME = 'calibrate_response_mapper' 5 | 6 | 7 | # TODO: LLM-based inference. 8 | @OPERATORS.register_module(OP_NAME) 9 | class CalibrateResponseMapper(CalibrateQAMapper): 10 | """ 11 | Mapper to calibrate response in question-answer pairs based on reference text. 
12 | """ # noqa: E501 13 | 14 | DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【回答】进行校准,\ 15 | 使其更加详细、准确,且仍可以回答原问题。只输出校准后的回答,不要输出多余内容。' 16 | 17 | def parse_output(self, raw_output): 18 | return None, raw_output.strip() 19 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/clean_email_mapper.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import regex as re 4 | 5 | from ..base_op import OPERATORS, Mapper 6 | 7 | 8 | @OPERATORS.register_module('clean_email_mapper') 9 | class CleanEmailMapper(Mapper): 10 | """Mapper to clean email in text samples.""" 11 | 12 | _batched_op = True 13 | 14 | def __init__(self, 15 | pattern: Optional[str] = None, 16 | repl: str = '', 17 | *args, 18 | **kwargs): 19 | """ 20 | Initialization method. 21 | 22 | :param pattern: regular expression pattern to search for within text. 23 | :param repl: replacement string, default is empty string. 24 | :param args: extra args 25 | :param kwargs: extra args 26 | """ 27 | super().__init__(*args, **kwargs) 28 | if pattern is None: 29 | self.pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+' 30 | else: 31 | self.pattern = pattern 32 | if ((len(pattern) > 2) and 33 | (pattern.startswith("r'") and pattern.endswith("'") 34 | or pattern.startswith('r"') and pattern.endswith('"'))): 35 | self.pattern = pattern[2:-1] 36 | 37 | self.repl = repl 38 | 39 | def process_batched(self, samples): 40 | for idx, text in enumerate(samples[self.text_key]): 41 | if not re.search(self.pattern, text, flags=re.DOTALL): 42 | continue 43 | samples[self.text_key][idx] = re.sub(pattern=self.pattern, 44 | repl=self.repl, 45 | string=text, 46 | flags=re.DOTALL) 47 | 48 | return samples 49 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/clean_html_mapper.py: -------------------------------------------------------------------------------- 1 | # Some code here has been modified from: 2 | # https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/ 3 | # -------------------------------------------------------- 4 | 5 | from data_juicer.utils.lazy_loader import LazyLoader 6 | 7 | from ..base_op import OPERATORS, Mapper 8 | 9 | selectolax = LazyLoader('selectolax') 10 | 11 | OP_NAME = 'clean_html_mapper' 12 | 13 | 14 | @OPERATORS.register_module(OP_NAME) 15 | class CleanHtmlMapper(Mapper): 16 | """Mapper to clean html code in text samples.""" 17 | 18 | _batched_op = True 19 | 20 | def __init__(self, *args, **kwargs): 21 | """ 22 | Initialization method. 23 | 24 | :param args: extra args 25 | :param kwargs: extra args 26 | """ 27 | super().__init__(*args, **kwargs) 28 | 29 | def process_batched(self, samples): 30 | 31 | def _clean_html(raw_html): 32 | raw_html = raw_html.replace('
<li>', '\n*') 33 | raw_html = raw_html.replace('</li>', '') 34 | raw_html = raw_html.replace('<ol>', '\n*') 35 | raw_html = raw_html.replace('</ol>
    ', '') 36 | parser = selectolax.parser.HTMLParser(raw_html) 37 | return parser.text() 38 | 39 | samples[self.text_key] = [ 40 | _clean_html(text) for text in samples[self.text_key] 41 | ] 42 | return samples 43 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/fix_unicode_mapper.py: -------------------------------------------------------------------------------- 1 | from data_juicer.utils.lazy_loader import LazyLoader 2 | 3 | from ..base_op import OPERATORS, Mapper 4 | 5 | ftfy = LazyLoader('ftfy') 6 | 7 | OP_NAME = 'fix_unicode_mapper' 8 | 9 | 10 | @OPERATORS.register_module(OP_NAME) 11 | class FixUnicodeMapper(Mapper): 12 | """Mapper to fix unicode errors in text samples.""" 13 | 14 | _batched_op = True 15 | 16 | def __init__(self, normalization: str = None, *args, **kwargs): 17 | """ 18 | Initialization method. 19 | 20 | :param normalization: the specified form of Unicode 21 | normalization mode, which can be one of 22 | ['NFC', 'NFKC', 'NFD', and 'NFKD'], default 'NFC'. 23 | :param args: extra args 24 | :param kwargs: extra args 25 | """ 26 | super().__init__(*args, **kwargs) 27 | if normalization and len(normalization) > 0: 28 | self.normalization = normalization.upper() 29 | else: 30 | self.normalization = 'NFC' 31 | 32 | if self.normalization.upper() not in ['NFC', 'NFKC', 'NFD', 'NFKD']: 33 | raise ValueError(f'Normalization mode [{normalization}] is not ' 34 | 'supported. Can only be one of ' 35 | '["NFC", "NFKC", "NFD", "NFKD"]') 36 | 37 | def process_batched(self, samples): 38 | samples[self.text_key] = [ 39 | ftfy.fix_text(text, normalization=self.normalization) 40 | for text in samples[self.text_key] 41 | ] 42 | return samples 43 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/optimize_query_mapper.py: -------------------------------------------------------------------------------- 1 | from data_juicer.ops.base_op import OPERATORS 2 | from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper 3 | 4 | OP_NAME = 'optimize_query_mapper' 5 | 6 | 7 | # TODO: Extend LLM-based OPs into API-based implementation. 8 | @OPERATORS.register_module(OP_NAME) 9 | class OptimizeQueryMapper(OptimizeQAMapper): 10 | """ 11 | Mapper to optimize query in question-answer pairs. 12 | """ 13 | 14 | DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。' # noqa: E501 15 | 16 | _accelerator = 'cuda' 17 | 18 | def parse_output(self, raw_output): 19 | return raw_output.strip(), None 20 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/optimize_response_mapper.py: -------------------------------------------------------------------------------- 1 | from data_juicer.ops.base_op import OPERATORS 2 | from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper 3 | 4 | OP_NAME = 'optimize_response_mapper' 5 | 6 | 7 | # TODO: Extend LLM-based OPs into API-based implementation. 8 | @OPERATORS.register_module(OP_NAME) 9 | class OptimizeResponseMapper(OptimizeQAMapper): 10 | """ 11 | Mapper to optimize response in question-answer pairs. 
12 | """ 13 | 14 | DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。' 15 | 16 | _accelerator = 'cuda' 17 | 18 | def parse_output(self, raw_output): 19 | return None, raw_output.strip() 20 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_bibliography_mapper.py: -------------------------------------------------------------------------------- 1 | # Some code here has been modified from: 2 | # https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/ 3 | # -------------------------------------------------------- 4 | 5 | import regex as re 6 | 7 | from ..base_op import OPERATORS, Mapper 8 | 9 | 10 | @OPERATORS.register_module('remove_bibliography_mapper') 11 | class RemoveBibliographyMapper(Mapper): 12 | """Mapper to remove bibliography at the end of documents in Latex 13 | samples.""" 14 | 15 | _batched_op = True 16 | 17 | def __init__(self, *args, **kwargs): 18 | """ 19 | Initialization method. 20 | 21 | :param args: extra args 22 | :param kwargs: extra args 23 | """ 24 | super().__init__(*args, **kwargs) 25 | self.pattern = r'(\\appendix|' 26 | self.pattern += r'\\begin\{references\}|' 27 | self.pattern += r'\\begin\{REFERENCES\}|' 28 | self.pattern += r'\\begin\{thebibliography\}|' 29 | self.pattern += r'\\bibliography\{.*\}' 30 | self.pattern += r').*$' 31 | 32 | def process_batched(self, samples): 33 | samples[self.text_key] = [ 34 | re.sub(pattern=self.pattern, 35 | repl=r'', 36 | string=text, 37 | flags=re.DOTALL) for text in samples[self.text_key] 38 | ] 39 | 40 | return samples 41 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_non_chinese_character_mapper.py: -------------------------------------------------------------------------------- 1 | import regex as re 2 | 3 | from ..base_op import OPERATORS, Mapper 4 | 5 | 6 | @OPERATORS.register_module('remove_non_chinese_character_mapper') 7 | class RemoveNonChineseCharacterlMapper(Mapper): 8 | """Mapper to remove non chinese Character in text samples.""" 9 | 10 | _batched_op = True 11 | 12 | def __init__(self, 13 | keep_alphabet: bool = True, 14 | keep_number: bool = True, 15 | keep_punc: bool = True, 16 | *args, 17 | **kwargs): 18 | """ 19 | Initialization method. 
20 | 21 | :param keep_alphabet: whether to keep alphabet 22 | :param keep_number: whether to keep number 23 | :param keep_punc: whether to keep punctuation 24 | :param args: extra args 25 | :param kwargs: extra args 26 | """ 27 | super().__init__(*args, **kwargs) 28 | self.pattern = u'[^\u4e00-\u9fa5' 29 | if keep_alphabet: 30 | self.pattern += u'A-Za-z' 31 | if keep_number: 32 | self.pattern += u'0-9' 33 | if keep_punc: 34 | self.pattern += u'., ,\\-。%《*》/•、&&(—)(+):?!!“”·]+' 35 | else: 36 | self.pattern += u']' 37 | 38 | def process_batched(self, samples): 39 | for idx, text in enumerate(samples[self.text_key]): 40 | if not re.search(self.pattern, text, flags=re.DOTALL): 41 | continue 42 | 43 | samples[self.text_key][idx] = re.sub(pattern=self.pattern, 44 | repl=r'', 45 | string=text, 46 | flags=re.DOTALL) 47 | return samples 48 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_specific_chars_mapper.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import regex as re 4 | 5 | from ..base_op import OPERATORS, Mapper 6 | 7 | 8 | @OPERATORS.register_module('remove_specific_chars_mapper') 9 | class RemoveSpecificCharsMapper(Mapper): 10 | """Mapper to clean specific chars in text samples.""" 11 | 12 | _batched_op = True 13 | 14 | def __init__(self, 15 | chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□', 16 | *args, 17 | **kwargs): 18 | """ 19 | Initialization method. 20 | 21 | :param chars_to_remove: a list or a string including all 22 | characters that need to be removed from text. 23 | :param args: extra args 24 | :param kwargs: extra args 25 | """ 26 | 27 | super().__init__(*args, **kwargs) 28 | if chars_to_remove: 29 | self.pattern = '[' + '|'.join(chars_to_remove) + ']' 30 | else: 31 | self.pattern = None 32 | 33 | def process_batched(self, samples): 34 | if self.pattern is None: 35 | return samples 36 | 37 | samples[self.text_key] = [ 38 | re.sub(pattern=self.pattern, 39 | repl=r'', 40 | string=text, 41 | flags=re.DOTALL) for text in samples[self.text_key] 42 | ] 43 | return samples 44 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_table_text_mapper.py: -------------------------------------------------------------------------------- 1 | import regex as re 2 | from pydantic import Field 3 | from typing_extensions import Annotated 4 | 5 | from ..base_op import OPERATORS, Mapper 6 | 7 | 8 | @OPERATORS.register_module('remove_table_text_mapper') 9 | class RemoveTableTextMapper(Mapper): 10 | """ 11 | Mapper to remove table texts from text samples. 12 | 13 | Regular expression is used to remove tables in the range of column 14 | number of tables. 15 | """ 16 | 17 | _batched_op = True 18 | 19 | def __init__(self, 20 | min_col: Annotated[int, Field(ge=2, le=20)] = 2, 21 | max_col: Annotated[int, Field(ge=2, le=20)] = 20, 22 | *args, 23 | **kwargs): 24 | """ 25 | Initialization method. 26 | 27 | :param min_col: The min number of columns of table to remove. 28 | :param max_col: The max number of columns of table to remove. 
29 | :param args: extra args 30 | :param kwargs: extra args 31 | """ 32 | super().__init__(*args, **kwargs) 33 | self.min_col = min_col 34 | self.max_col = max_col 35 | self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}' 36 | 37 | def process_batched(self, samples): 38 | for idx, text in enumerate(samples[self.text_key]): 39 | for i in range(self.min_col - 1, self.max_col): 40 | pattern = re.compile(self.pattern % i) 41 | text = pattern.sub('', text) 42 | 43 | samples[self.text_key][idx] = text 44 | 45 | return samples 46 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/sentence_split_mapper.py: -------------------------------------------------------------------------------- 1 | from data_juicer.utils.model_utils import get_model, prepare_model 2 | from data_juicer.utils.nltk_utils import patch_nltk_pickle_security 3 | 4 | from ..base_op import OPERATORS, Mapper 5 | from ..common import get_sentences_from_document 6 | 7 | OP_NAME = 'sentence_split_mapper' 8 | 9 | 10 | @OPERATORS.register_module(OP_NAME) 11 | class SentenceSplitMapper(Mapper): 12 | """Mapper to split text samples to sentences.""" 13 | 14 | _batched_op = True 15 | 16 | def __init__(self, lang: str = 'en', *args, **kwargs): 17 | """ 18 | Initialization method. 19 | 20 | :param lang: split sentence of text in which language. 21 | :param args: extra args 22 | :param kwargs: extra args 23 | """ 24 | super().__init__(*args, **kwargs) 25 | self.lang = lang 26 | 27 | # Ensure NLTK pickle security patch is applied 28 | patch_nltk_pickle_security() 29 | 30 | # Prepare the sentence tokenizer model 31 | self.model_key = prepare_model(model_type='nltk', lang=lang) 32 | 33 | def process_batched(self, samples): 34 | # Get the sentence tokenizer model 35 | nltk_model = get_model(self.model_key) 36 | 37 | samples[self.text_key] = [ 38 | get_sentences_from_document( 39 | text, model_func=nltk_model.tokenize if nltk_model else None) 40 | for text in samples[self.text_key] 41 | ] 42 | 43 | return samples 44 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/whitespace_normalization_mapper.py: -------------------------------------------------------------------------------- 1 | # Most of the code here has been modified from: 2 | # https://github.com/bigscience-workshop/data-preparation 3 | # -------------------------------------------------------- 4 | 5 | from ..base_op import OPERATORS, Mapper 6 | from ..common.special_characters import VARIOUS_WHITESPACES 7 | 8 | 9 | @OPERATORS.register_module('whitespace_normalization_mapper') 10 | class WhitespaceNormalizationMapper(Mapper): 11 | """ 12 | Mapper to normalize different kinds of whitespaces to whitespace ' ' (0x20) 13 | in text samples. 14 | 15 | Different kinds of whitespaces can be found here: 16 | https://en.wikipedia.org/wiki/Whitespace_character 17 | """ 18 | 19 | _batched_op = True 20 | 21 | def __init__(self, *args, **kwargs): 22 | """ 23 | Initialization method. 
24 | 25 | :param args: extra args 26 | :param kwargs: extra args 27 | """ 28 | super().__init__(*args, **kwargs) 29 | 30 | def process_batched(self, samples): 31 | for idx, text in enumerate(samples[self.text_key]): 32 | # remove whitespaces before and after the main content 33 | text = text.strip() 34 | 35 | # replace all kinds of whitespaces with ' ' 36 | samples[self.text_key][idx] = ''.join([ 37 | char if char not in VARIOUS_WHITESPACES else ' ' 38 | for char in text 39 | ]) 40 | 41 | return samples 42 | -------------------------------------------------------------------------------- /data_juicer/ops/selector/__init__.py: -------------------------------------------------------------------------------- 1 | from .frequency_specified_field_selector import FrequencySpecifiedFieldSelector 2 | from .random_selector import RandomSelector 3 | from .range_specified_field_selector import RangeSpecifiedFieldSelector 4 | from .tags_specified_field_selector import TagsSpecifiedFieldSelector 5 | from .topk_specified_field_selector import TopkSpecifiedFieldSelector 6 | 7 | __all__ = [ 8 | 'FrequencySpecifiedFieldSelector', 'RandomSelector', 9 | 'RangeSpecifiedFieldSelector', 'TagsSpecifiedFieldSelector', 10 | 'TopkSpecifiedFieldSelector' 11 | ] 12 | -------------------------------------------------------------------------------- /data_juicer/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Some code here has been modified from: 2 | # https://github.com/Megvii-BaseDetection/YOLOX 3 | # -------------------------------------------------------- 4 | 5 | # This file is used for package installation. Script of tools will be 6 | # available. 7 | 8 | import sys 9 | from importlib import abc, util 10 | from pathlib import Path 11 | 12 | _TOOLS_PATH = Path(__file__).resolve().parent.parent.parent / 'tools' 13 | 14 | if _TOOLS_PATH.is_dir(): 15 | # This is true only for in-place installation 16 | # (pip install -e, setup.py develop), 17 | # where setup(package_dir=) does not work: 18 | # https://github.com/pypa/setuptools/issues/230 19 | 20 | class _PathFinder(abc.MetaPathFinder): 21 | 22 | def find_spec(self, name, path, target=None): 23 | if not name.startswith('data_juicer.tools.'): 24 | return 25 | project_name = name.split('.')[-1] + '.py' 26 | target_file = _TOOLS_PATH / project_name 27 | if not target_file.is_file(): 28 | return 29 | return util.spec_from_file_location(name, target_file) 30 | 31 | sys.meta_path.append(_PathFinder()) 32 | -------------------------------------------------------------------------------- /data_juicer/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/data_juicer/utils/__init__.py -------------------------------------------------------------------------------- /data_juicer/utils/resource_utils.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import psutil 4 | from loguru import logger 5 | 6 | NVSMI_REPORT = True 7 | 8 | 9 | def query_cuda_info(query_key): 10 | global NVSMI_REPORT 11 | # get cuda info using "nvidia-smi" command in MB 12 | try: 13 | nvidia_smi_output = subprocess.check_output([ 14 | 'nvidia-smi', f'--query-gpu={query_key}', 15 | '--format=csv,noheader,nounits' 16 | ]).decode('utf-8') 17 | except Exception as e: 18 | if 'non-zero exit status 2' in str(e): 19 | err_msg = f'The specified query_key 
[{query_key}] might not be ' \ 20 | f'supported by command nvidia-smi. Please check and ' \ 21 | f'retry!' 22 | elif 'No such file or directory' in str(e): 23 | err_msg = 'Command nvidia-smi is not found. There might be no ' \ 24 | 'GPUs on this machine.' 25 | else: 26 | err_msg = str(e) 27 | if NVSMI_REPORT: 28 | logger.warning(err_msg) 29 | NVSMI_REPORT = False 30 | return None 31 | cuda_info_list = [] 32 | for line in nvidia_smi_output.strip().split('\n'): 33 | cuda_info_list.append(int(line)) 34 | return cuda_info_list 35 | 36 | 37 | def get_cpu_count(): 38 | return psutil.cpu_count() 39 | 40 | 41 | def get_cpu_utilization(): 42 | return psutil.cpu_percent() 43 | 44 | 45 | def query_mem_info(query_key): 46 | mem = psutil.virtual_memory() 47 | if query_key not in mem._fields: 48 | logger.warning(f'No such query key [{query_key}] for memory info. ' 49 | f'Should be one of {mem._fields}') 50 | return None 51 | val = round(mem.__getattribute__(query_key) / (2**20), 2) # in MB 52 | return val 53 | -------------------------------------------------------------------------------- /data_juicer/utils/sample.py: -------------------------------------------------------------------------------- 1 | from itertools import chain, repeat 2 | 3 | import numpy as np 4 | 5 | 6 | def random_sample(dataset, weight=1.0, sample_number=0, seed=None): 7 | """ 8 | Randomly sample a subset from a dataset with weight or number, 9 | if sample number is bigger than 0, we will use sample 10 | number instead of weight. 11 | :param dataset: a HuggingFace dataset 12 | :param weight: sample ratio of dataset 13 | :param sample_number: sample number of dataset 14 | :param seed: random sample seed, if None, 42 as default 15 | :return: a subset of dataset 16 | """ 17 | if seed is None: 18 | seed = 42 19 | 20 | ds_samples = dataset.num_rows 21 | if sample_number <= 0: 22 | sample_number = int(np.ceil(ds_samples * weight)) 23 | 24 | if sample_number == ds_samples: 25 | return dataset 26 | 27 | sample_index = range(sample_number) 28 | 29 | n_repeat = int(np.ceil(sample_number / ds_samples)) - 1 30 | if n_repeat > 0: 31 | remain_samples = sample_number - n_repeat * ds_samples 32 | sample_index = chain(*repeat(range(ds_samples), n_repeat), 33 | range(remain_samples)) 34 | 35 | return dataset.shuffle(seed=seed).select(sample_index) 36 | -------------------------------------------------------------------------------- /demos/README_ZH.md: -------------------------------------------------------------------------------- 1 | # 演示 2 | 3 | 此文件夹包含一些演示样例,帮助用户轻松体验 Data-Juicer 的各种功能和工具。 4 | 5 | ## 用法 6 | 7 | 使用 `demos` 子目录下的 `app.py` 来执行演示样例。 8 | 9 | ```shell 10 | cd 11 | streamlit run app.py 12 | ``` 13 | 14 | ## 可用的演示 15 | 16 | - 数据集样例 (`data`) 17 | - 该文件夹包含一些样例数据集。 18 | 19 | - 初探索 (`overview_scan`) 20 | - 该示例介绍了 Data-Juicer 的基本概念和功能,例如特性、配置系统,算子等等。 21 | 22 | - 数据处理回路 (`data_process_loop`) 23 | - 该示例用来分析和处理数据集,并给出处理前后数据集的统计信息比对。 24 | 25 | - 词法多样性可视化 (`data_visualization_diversity`) 26 | - 该示例可以用来分析 CFT 数据集的动词-名词结构,并绘制成sunburst层级环形图表。 27 | 28 | - 算子效果可视化 (`data_visualization_op_effect`) 29 | - 该示例可以分析数据集的统计信息,并根据这些统计信息可以显示出每个 `Filter` 算子在不同阈值下的效果。 30 | 31 | - 统计信息可视化 (`data_visualization_statistics`) 32 | - 该示例可以分析数据集,并获得多达13种统计信息。 33 | 34 | - 处理 CFT 中文数据 (`process_cft_zh_data`) 35 | - 以 Alpaca-CoT 的部分中文数据为例,演示了 LLM 中指令跟随微调数据和有监督微调数据的分析和处理流程。 36 | 37 | - 处理预训练科学文献类数据 (`process_sci_data`) 38 | - 以 arXiv 的部分数据为例,演示了如何处理 LLM 预训练中的科学文献类数据的分析和处理流程。 39 | 40 | - 处理预训练代码类数据 (`process_code_data`) 41 | - 以 Stack-Exchange 的部分数据为例,演示了如何处理 LLM 
预训练中的代码类数据的分析和处理流程。 42 | 43 | - 文本质量打分器 (`tool_quality_classifier`) 44 | - 该示例提供了3种文本质量打分器,对数据集进行打分评估。 45 | 46 | - 按语言分割数据集 (`tool_dataset_splitting_by_language`) 47 | - 该示例按照语言将数据集拆分为不同的子数据集。 48 | 49 | - 数据混合 (`data_mixture`) 50 | - 该示例从多份数据集中进行采样并混合为一个新的数据集。 51 | -------------------------------------------------------------------------------- /demos/api_service/configs/dj_config_template.yaml: -------------------------------------------------------------------------------- 1 | # data-juicer config template 2 | 3 | # global parameters 4 | project_name: 'dj_agent' 5 | dataset_path: '' # path to your dataset directory or file, specified in the agent 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | export_path: '' # path to the output path, specified in the agent 9 | export_original_dataset: true 10 | 11 | # process schedule 12 | # a list of several process operators with their arguments, specified in the agent 13 | process: [] 14 | -------------------------------------------------------------------------------- /demos/api_service/configs/model_configs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "config_name": "gpt-4", 4 | "model_type": "openai-chat", 5 | "model_name": "gpt-4", 6 | "api_key": "your API key", 7 | "organization": "your organization name", 8 | "generate_args": { 9 | "temperature": 0.5 10 | } 11 | }, 12 | { 13 | "config_name": "dashscope_chat-qwen-max", 14 | "model_type": "dashscope_chat", 15 | "model_name": "qwen-max", 16 | "api_key": "your API key", 17 | "generate_args": { 18 | "temperature": 0.0 19 | } 20 | } 21 | ] 22 | -------------------------------------------------------------------------------- /demos/auto_evaluation_helm/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import streamlit as st 5 | 6 | 7 | class Visualize: 8 | 9 | @staticmethod 10 | def setup(): 11 | st.set_page_config( 12 | page_title='Data-Juicer', 13 | page_icon=':smile', 14 | layout='wide', 15 | # initial_sidebar_state="expanded", 16 | ) 17 | 18 | readme_link = 'https://github.com/alibaba/data-juicer' 19 | st.markdown( 20 | '#
<div align="center"> Data-Juicer </div>', 21 | unsafe_allow_html=True, 22 | ) 23 | st.markdown( 24 | f'<div align="center"> A One-Stop Data Processing System for \ 25 | Large Language Models, \ 26 | see more details in our <a href={readme_link}>Github</a></div>
    ', 27 | unsafe_allow_html=True, 28 | ) 29 | 30 | @staticmethod 31 | def visualize(): 32 | Visualize.setup() 33 | 34 | 35 | def main(): 36 | 37 | def make_image(line): 38 | pattern = r'!\[(.*?)\]\((.*?)\)' 39 | matches = re.findall(pattern, line) 40 | st.image(matches[0][1], output_format='png', use_column_width=True) 41 | 42 | Visualize.visualize() 43 | buffer = [] 44 | with open('README_ZH.md', 'r', encoding='utf-8') as f: 45 | lines = f.readlines() 46 | for line in lines: 47 | if 'imgs/' in line: 48 | st.markdown('\n'.join(buffer)) 49 | make_image(line) 50 | buffer.clear() 51 | else: 52 | buffer.append(line) 53 | st.markdown('\n'.join(buffer)) 54 | # hello() 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /demos/auto_evaluation_helm/imgs/data-juicer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/auto_evaluation_helm/imgs/data-juicer.png -------------------------------------------------------------------------------- /demos/auto_evaluation_helm/imgs/eval-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/auto_evaluation_helm/imgs/eval-01.png -------------------------------------------------------------------------------- /demos/auto_evaluation_helm/imgs/eval-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/auto_evaluation_helm/imgs/eval-02.png -------------------------------------------------------------------------------- /demos/data/demo-dataset-annotation-human-preference.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt": "What is the capital of France?", "answer1": "Paris", "answer2": "Lyon"} 2 | {"prompt": "Which planet is known as the Red Planet?", "answer1": "Mars", "answer2": "Venus"} 3 | {"prompt": "What is the chemical symbol for gold?", "answer1": "Au", "answer2": "Ag"} 4 | {"prompt": "Who wrote 'Romeo and Juliet'?", "answer1": "William Shakespeare", "answer2": "Christopher Marlowe"} 5 | {"prompt": "What is the largest mammal on Earth?", "answer1": "Blue Whale", "answer2": "African Elephant"} 6 | {"prompt": "In which year did World War II end?", "answer1": "1945", "answer2": "1944"} 7 | {"prompt": "What is the square root of 64?", "answer1": "8", "answer2": "6"} 8 | {"prompt": "Who painted the Mona Lisa?", "answer1": "Leonardo da Vinci", "answer2": "Michelangelo"} 9 | {"prompt": "What is the main component of the Sun?", "answer1": "Hydrogen", "answer2": "Helium"} 10 | {"prompt": "Which programming language was created by Guido van Rossum?", "answer1": "Python", "answer2": "Java"} -------------------------------------------------------------------------------- /demos/data/demo-dataset-chatml.jsonl: -------------------------------------------------------------------------------- 1 | {"messages": [{"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "谁在文艺复兴时期绘制人体?"}, {"role": "assistant", "content": "文艺复兴时期是一个关于艺术、文化和学术的复兴运动,在这个时期,许多艺术家都绘制了人体。"},{"role": "user", "content": "那雕塑方面如何呢?"}, {"role": "assistant", "content": "文艺复兴时期的雕塑也非常有名,几位世界级的雕塑大师都出自于这个时期。"}]} 2 | 
{"messages":[{"content":"You are a helpful assistant","role":"system"},{"content":"什么时期的音乐家开始广泛使用交响乐团?","role":"user"},{"content":"浪漫主义时期,音乐家们开始广泛使用和扩展交响乐团,创作出规模宏大、情感丰富的交响乐作品。","role":"assistant"}]} 3 | {"messages":[{"content":"You are a helpful assistant","role":"system"},{"content":"哪个物理定律描述了物体在不受外力作用时保持静止或匀速直线运动的状态?","role":"user"},{"content":"牛顿第一定律,也称为惯性定律,描述了物体在不受外力作用时保持静止状态或匀速直线运动的状态。","role":"assistant"}]} 4 | {"messages":[{"content":"You are a helpful assistant","role":"system"},{"content":"哪种文学流派强调通过象征和暗喻探索潜意识思维?","role":"user"},{"content":"现代主义文学流派强调通过象征、暗喻以及非线性叙述等手法,深入探索人物的内心世界与潜意识思维。","role":"assistant"}]} -------------------------------------------------------------------------------- /demos/data/demo-dataset-content.jsonl: -------------------------------------------------------------------------------- 1 | {"content": "Today is Sunday and it's a happy day!", "src": "Arxiv", "date": "2023-04-27", "version": "1.0"} 2 | {"content": "Do you need a cup of coffee?", "src": "code", "author": "xxx"} 3 | {"content": "你好,请问你是谁", "src": "customized", "author": "xxx"} 4 | {"content": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "src": "Oscar", "version": "2.0"} 5 | {"content": "欢迎来到阿里巴巴!", "src": "customized", "version": "0.1", "author": "xxx"} 6 | {"content": "This paper proposed a novel method on LLM pretraining.", "src": "customized", "author": "xxx"} 7 | -------------------------------------------------------------------------------- /demos/data/demo-dataset-images.jsonl: -------------------------------------------------------------------------------- 1 | {"images":["../../tests/ops/data/img1.png"], "text": "<__dj__image> A comfortable bed."} 2 | {"images":["../../tests/ops/data/img2.jpg"], "text": "<__dj__image> A bus."} 3 | {"images":["../../tests/ops/data/img3.jpg"], "text": "<__dj__image> Black and white photograph of a woman holding an umbrella."} 4 | -------------------------------------------------------------------------------- /demos/data/demo-dataset-videos.jsonl: -------------------------------------------------------------------------------- 1 | {"videos":["../../tests/ops/data/video1.mp4"], "text": "<__dj__video> a cartoon"} 2 | {"videos":["../../tests/ops/data/video1.mp4"], "text": "<__dj__video> 一段卡通"} 3 | {"videos":["../../tests/ops/data/video2.mp4"], "text": "<__dj__video> a man"} 4 | {"videos":["../../tests/ops/data/video2.mp4"], "text": "<__dj__video> 一个男人"} 5 | {"videos":["../../tests/ops/data/video3.mp4"], "text": "<__dj__video> two women"} 6 | {"videos":["../../tests/ops/data/video3.mp4"], "text": "<__dj__video> 两个女人"} 7 | -------------------------------------------------------------------------------- /demos/data/demo-dataset.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}} 2 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}} 3 | {"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}} 4 | {"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}} 5 | {"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}} 6 | {"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}} 7 | 
-------------------------------------------------------------------------------- /demos/data/demo-dataset_1725870268.jsonl: -------------------------------------------------------------------------------- 1 | {"text":"Today is Sunday and it's a happy day!","meta":{"src":"Arxiv","date":1682553600000,"version":"1.0","author":null},"__dj__stats__":{"text_len":37}} 2 | {"text":"Do you need a cup of coffee?","meta":{"src":"code","date":null,"version":null,"author":"xxx"},"__dj__stats__":{"text_len":28}} 3 | {"text":"Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.","meta":{"src":"Oscar","date":null,"version":"2.0","author":null},"__dj__stats__":{"text_len":101}} 4 | {"text":"This paper proposed a novel method on LLM pretraining.","meta":{"src":"customized","date":null,"version":null,"author":"xxx"},"__dj__stats__":{"text_len":54}} 5 | -------------------------------------------------------------------------------- /demos/data/demo-dataset_1725870628.jsonl: -------------------------------------------------------------------------------- 1 | {"text":"Today is Sunday and it's a happy day!","meta":{"src":"Arxiv","date":1682553600000,"version":"1.0","author":null},"__dj__stats__":{"text_len":37}} 2 | {"text":"Do you need a cup of coffee?","meta":{"src":"code","date":null,"version":null,"author":"xxx"},"__dj__stats__":{"text_len":28}} 3 | {"text":"This paper proposed a novel method on LLM pretraining.","meta":{"src":"customized","date":null,"version":null,"author":"xxx"},"__dj__stats__":{"text_len":54}} 4 | -------------------------------------------------------------------------------- /demos/data_process_hpo/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | 4 | class Visualize: 5 | 6 | @staticmethod 7 | def setup(): 8 | st.set_page_config( 9 | page_title='Data-Juicer', 10 | page_icon=':smile', 11 | #layout='wide', 12 | # initial_sidebar_state="expanded", 13 | ) 14 | 15 | readme_link = 'https://github.com/alibaba/data-juicer' 16 | st.markdown( 17 | '
<div align="center"> Data-Juicer \ 18 | </div>', 19 | unsafe_allow_html=True, 20 | ) 21 | st.markdown( 22 | f'<div align="center"> A One-Stop Data Processing System for \ 23 | Large Language Models, \ 24 | see more details in our <a href={readme_link}>Github</a></div>', 25 | unsafe_allow_html=True, 26 | ) 27 | 28 | @staticmethod 29 | def visualize(): 30 | Visualize.setup() 31 | 32 | 33 | def main(): 34 | 35 | def hello(): 36 | 37 | st.image('imgs/data-juicer.png', 38 | output_format='png', 39 | use_column_width=True) 40 | demo = 'The demo is coming soon😊' 41 | st.markdown( 42 | f'<div align="center"> {demo} \ 43 | </div>
    ', 44 | unsafe_allow_html=True, 45 | ) 46 | 47 | Visualize.visualize() 48 | hello() 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /demos/data_process_hpo/imgs/data-juicer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/data_process_hpo/imgs/data-juicer.png -------------------------------------------------------------------------------- /demos/data_visualization_op_effect/data/demo-dataset.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}} 2 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}} 3 | {"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}} 4 | {"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}} 5 | {"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}} 6 | {"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}} 7 | -------------------------------------------------------------------------------- /demos/data_visualization_op_insight/cache/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/data_visualization_op_insight/cache/.gitkeep -------------------------------------------------------------------------------- /demos/data_visualization_statistics/data/demo-dataset.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}} 2 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}} 3 | {"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}} 4 | {"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}} 5 | {"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}} 6 | {"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}} 7 | -------------------------------------------------------------------------------- /demos/process_on_ray/configs/dedup.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for dataset 2 | 3 | # global parameters 4 | project_name: 'demo-dedup' 5 | dataset_path: './demos/process_on_ray/data/' 6 | export_path: './outputs/demo-dedup/demo-ray-bts-dedup-processed' 7 | 8 | executor_type: 'ray' 9 | ray_address: 'auto' 10 | 11 | # process schedule 12 | # a list of several process operators with their arguments 13 | process: 14 | - ray_bts_minhash_deduplicator: 15 | tokenization: 'character' -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/Note.md: -------------------------------------------------------------------------------- 1 | # Note for dataset path 2 | 3 | The videos/images path here support both 
absolute path and relative path. 4 | Please use an address that can be accessed on all nodes (such as an address within a NAS file-sharing system). 5 | For relative paths, these should be relative to the directory where the dataset file is located (the dataset_path parameter in the config). 6 | - if the dataset_path parameter is a directory, then it's relative to dataset_path 7 | - if the dataset_path parameter is a file, then it's relative to data_path parameter's corresponding dirname 8 | -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/demo-dataset.jsonl: -------------------------------------------------------------------------------- 1 | {"videos": ["./videos/video1.mp4"], "text": "<__dj__video> 10s videos <|__dj__eoc|>'}"} 2 | {"videos": ["./videos/video2.mp4"], "text": "<__dj__video> 23s videos <|__dj__eoc|>'}"} 3 | {"videos": ["./videos/video3.mp4"], "text": "<__dj__video> 46s videos <|__dj__eoc|>'}"} -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/videos/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/process_video_on_ray/data/videos/video1.mp4 -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/videos/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/process_video_on_ray/data/videos/video2.mp4 -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/videos/video3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/process_video_on_ray/data/videos/video3.mp4 -------------------------------------------------------------------------------- /demos/tool_quality_classifier/quality_classifier/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/demos/tool_quality_classifier/quality_classifier/__init__.py -------------------------------------------------------------------------------- /docs/imgs/data-juicer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/docs/imgs/data-juicer.jpg -------------------------------------------------------------------------------- /docs/imgs/eval-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/docs/imgs/eval-01.png -------------------------------------------------------------------------------- /docs/imgs/eval-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/docs/imgs/eval-02.png -------------------------------------------------------------------------------- /docs/sphinx_doc/Makefile: 
-------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/sphinx_doc/README.md: -------------------------------------------------------------------------------- 1 | # Data-Juicer Documentation 2 | 3 | We build our documentation with help of Sphinx. 4 | To update the generated 5 | doc, please run the following commands: 6 | 7 | ```bash 8 | # $~/data_juicer/docs/sphinx_doc 9 | # 1. install the sphinx requirements and init the sphinx-quickstart 10 | pip install "py-data-juicer[dev]" 11 | # or pip install -r ../../environments/dev_requires 12 | 13 | # 2. auto generate and build the doc 14 | ./build_doc.sh 15 | 16 | # 3. finalize the doc, which is stored in the `build/` directory 17 | mv build/ position_to_publish 18 | ``` 19 | 20 | Automatic action in github can be found in [here](https://github.com/modelscope/data-juicer/blob/main/.github/workflows/deploy_sphinx_docs.yml). -------------------------------------------------------------------------------- /docs/sphinx_doc/README_ZH.md: -------------------------------------------------------------------------------- 1 | # Data-Juicer 文档 2 | 3 | Data-Juicer 借助 Sphinx 构建文档。 4 | 如需更新生成的文档,请运行以下命令: 5 | 6 | ```bash 7 | # $~/data_juicer/docs/sphinx_doc 8 | # 1.安装 sphinx 的依赖并初始化 sphinx-quickstart 9 | pip install "py-data-juicer[dev]" 10 | # or pip install -r ../../environments/dev_requires 11 | # 2. 运行文档构建脚本 12 | ./build_doc.sh 13 | 14 | # 3. 构建完成的文档存储目录为 `build/` 15 | mv build/ position_to_publish 16 | ``` 17 | 18 | Github上的自动化部署配置可参考 [该处]( 19 | https://github.com/modelscope/data-juicer/blob/main/.github/workflows/deploy_sphinx_docs.yml). -------------------------------------------------------------------------------- /docs/sphinx_doc/_templates/package.rst_t: -------------------------------------------------------------------------------- 1 | {%- macro automodule(modname, options) -%} 2 | .. automodule:: {{ modname }} 3 | {%- for option in options %} 4 | :{{ option }}: 5 | {%- endfor %} 6 | {%- endmacro %} 7 | 8 | {{- pkgname | heading }} 9 | 10 | {%- macro toctree(docnames) -%} 11 | .. 
toctree:: 12 | :maxdepth: {{ maxdepth }} 13 | {% for docname in docnames %} 14 | {{ docname }} 15 | {%- endfor %} 16 | {%- endmacro %} 17 | 18 | {{ automodule(pkgname, automodule_options) }} 19 | -------------------------------------------------------------------------------- /docs/sphinx_doc/build_doc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | make clean 3 | languages=(en zh_CN) 4 | 5 | for lang in "${languages[@]}"; do 6 | sphinx-multiversion source build/$lang -D "language=$lang" 7 | done 8 | 9 | -------------------------------------------------------------------------------- /docs/sphinx_doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/sphinx_doc/redirect.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Redirecting to https://[REPOSITORY_OWNER].github.io/data-juicer/en/main/ 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_static/sidebar-menu.css: -------------------------------------------------------------------------------- 1 | .sidebar-bottom-menu { 2 | position: sticky; 3 | bottom: 0; 4 | width: 100%; 5 | background: var(--color-sidebar-background); 6 | border-top: 1px solid var(--color-sidebar-search-border); 7 | z-index: 1000; 8 | } 9 | 10 | .sidebar-bottom-menu .current-info { 11 | padding: var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal); 12 | text-align: center; 13 | cursor: pointer; 14 | color: var(--color-foreground-primary); 15 | } 16 | 17 | .sidebar-bottom-menu .current-info span { 18 | margin: 0 0.5em; 19 | } 20 | 21 | .sidebar-bottom-menu .dropdown-panel { 22 | display: none; 23 | position: absolute; 24 | bottom: 100%; 25 | left: 0; 26 | width: 100%; 27 | background: var(--color-sidebar-background); 28 | border-top: 1px solid var(--color-sidebar-search-border); 29 | } 30 | 31 | .sidebar-bottom-menu:hover .dropdown-panel { 32 | display: block; 33 | } 34 | 35 | .sidebar-bottom-menu .section { 36 | padding: var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal); 37 | } 38 | 39 | .sidebar-bottom-menu .section dt { 40 | color: var(--color-foreground-secondary); 41 | margin-bottom: 0.5em; 42 | font-weight: bold; 43 | } 44 | 45 | .sidebar-bottom-menu .section dd { 46 | margin: 0; 47 | } 48 | 49 | .sidebar-bottom-menu .section dd a { 50 | display: block; 51 | padding: 0.3em 0; 52 | color: 
var(--color-sidebar-link-text--top-level); 53 | text-decoration: none; 54 | } 55 | 56 | .sidebar-bottom-menu .section dd a:hover { 57 | color: var(--color-sidebar-link-text--top-level--hover); 58 | } 59 | 60 | .sidebar-bottom-menu .section dd a.active { 61 | font-weight: bold; 62 | } 63 | -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_templates/page.html: -------------------------------------------------------------------------------- 1 | {% extends "!page.html" %} 2 | {% block body %} 3 | {% if current_version and latest_version and current_version != latest_version %} 4 |

    5 | 6 | {% if current_version.is_released %} 7 | You're reading an old version of this documentation. 8 | If you want up-to-date information, please have a look at {{latest_version.name}}. 9 | {% else %} 10 | You're reading the documentation from the main branch. 11 | For the latest released version, please have a look at {{latest_version.name}}. 12 | {% endif %} 13 | 14 |

    15 | {% endif %} 16 | {{ super() }} 17 | {% endblock %}% -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_templates/sidebar/bottom_menu.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/sphinx_doc/source/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | .. toctree:: 4 | :maxdepth: 2 5 | :glob: 6 | 7 | data_juicer.core 8 | data_juicer.ops 9 | data_juicer.ops.filter 10 | data_juicer.ops.mapper 11 | data_juicer.ops.deduplicator 12 | data_juicer.ops.selector 13 | data_juicer.ops.common 14 | data_juicer.analysis 15 | data_juicer.config 16 | data_juicer.format 17 | -------------------------------------------------------------------------------- /docs/sphinx_doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. _Data Processing for and with Foundation Models: 2 | .. role:: raw-html-m2r(raw) 3 | :format: html 4 | 5 | `[中文主页] <../../zh_CN/main/index_ZH>`_ | `[DJ-Cookbook] `_ | `[OperatorZoo] `_ | `[API] `_ | `[Awesome LLM Data] `_ 6 | 7 | Data Processing for and with Foundation Models 8 | ============================================== 9 | 10 | .. include:: README.md 11 | :start-after: # Data Processing for and with Foundation Models 12 | :parser: myst_parser.sphinx_ 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | :caption: Tutorial 17 | :glob: 18 | :hidden: 19 | 20 | docs/tutorial/* 21 | 22 | .. toctree:: 23 | :maxdepth: 2 24 | :caption: docs 25 | :glob: 26 | :hidden: 27 | 28 | docs/Operators 29 | docs/RecipeGallery 30 | docs/DatasetCfg 31 | docs/* 32 | 33 | .. toctree:: 34 | :maxdepth: 2 35 | :caption: demos 36 | :glob: 37 | :hidden: 38 | 39 | demos/* 40 | demos/**/* 41 | 42 | .. toctree:: 43 | :maxdepth: 2 44 | :caption: tools 45 | :glob: 46 | :hidden: 47 | 48 | tools/* 49 | tools/**/* 50 | 51 | .. toctree:: 52 | :maxdepth: 2 53 | :caption: thirdparty 54 | :glob: 55 | :hidden: 56 | 57 | thirdparty/* 58 | thirdparty/**/* 59 | 60 | .. toctree:: 61 | :maxdepth: 2 62 | :caption: API Reference 63 | :hidden: 64 | 65 | api 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /docs/sphinx_doc/source/index_ZH.rst: -------------------------------------------------------------------------------- 1 | .. _Data Processing for and with Foundation Models: 2 | .. role:: raw-html-m2r(raw) 3 | :format: html 4 | 5 | 6 | `[英文主页] <../../en/main/index>`_ | `[DJ-Cookbook] `_ | `[算子池] `_ | `[API] `_ | `[Awesome LLM Data] `_ 7 | 8 | Data Processing for and with Foundation Models 9 | ============================================== 10 | 11 | .. include:: README_ZH.md 12 | :start-after: # Data Processing for and with Foundation Models 13 | :parser: myst_parser.sphinx_ 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | :caption: 教程 18 | :glob: 19 | :hidden: 20 | 21 | docs/tutorial/* 22 | 23 | .. toctree:: 24 | :maxdepth: 2 25 | :caption: 帮助文档 26 | :glob: 27 | :hidden: 28 | 29 | docs/Operators 30 | docs/RecipeGallery_ZH 31 | docs/DatasetCfg_ZH 32 | docs/* 33 | 34 | .. toctree:: 35 | :maxdepth: 2 36 | :caption: demos 37 | :glob: 38 | :hidden: 39 | 40 | demos/* 41 | demos/**/* 42 | 43 | .. toctree:: 44 | :maxdepth: 2 45 | :caption: 工具 46 | :glob: 47 | :hidden: 48 | 49 | tools/* 50 | tools/**/* 51 | 52 | .. 
toctree:: 53 | :maxdepth: 2 54 | :caption: 第三方 55 | :glob: 56 | :hidden: 57 | 58 | thirdparty/* 59 | thirdparty/**/* 60 | 61 | .. toctree:: 62 | :maxdepth: 2 63 | :hidden: 64 | :caption: API Reference 65 | 66 | api -------------------------------------------------------------------------------- /label_studio_localhost_connection.json: -------------------------------------------------------------------------------- 1 | { 2 | "server_url": "http://localhost:7070", 3 | "api_token": "05409236-67a5-4169-af96-a52a818d0e81", 4 | "username": "admin@example.com", 5 | "password": "abcd1234" 6 | } 7 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Scripts for Running on Multi Nodes 2 | 3 | 4 | #### Running Using DLC(Deep Learing Containers) 5 | 6 | Internally we use [DLC](https://www.alibabacloud.com/help/zh/pai/user-guide/container-training/) from [PAI](https://www.alibabacloud.com/zh/product/machine-learning) to process data on multiple nodes. 7 | 8 | The scripts to run are in ./dlc folder. 9 | 10 | #### Running Using Slurm 11 | 12 | We provide scripts to support running on slurm, see ./run_slurm.sh. 13 | 14 | You can also manually partition the data according to specific circumstances and then use Slurm to run it on multiple machines by yourself. 15 | -------------------------------------------------------------------------------- /scripts/dlc/run_on_dlc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # parameters 4 | datajuicer_path= # path to data-juicer 5 | config_path= # path to config file 6 | 7 | # hostname 8 | hostname=$(hostname) 9 | 10 | # into datajuicer_path 11 | cd "$datajuicer_path" || { echo "Could not change directory to $datajuicer_path"; exit 1; } 12 | 13 | # copy and generate new config file for current host 14 | 15 | config_basename=$(basename "$config_path") 16 | config_dirname=$(dirname "$config_path") 17 | config_extension="${config_basename##*.}" 18 | config_basename="${config_basename%.*}" 19 | 20 | new_config_file="${config_dirname}/${config_basename}_$hostname.$config_extension" 21 | cp "$config_path" "$new_config_file" || { echo "Could not copy config file"; exit 1; } 22 | 23 | echo "$new_config_file" 24 | 25 | if [[ "$OSTYPE" == "darwin"* ]]; then 26 | SED_I_SUFFIX=".bak" 27 | else 28 | SED_I_SUFFIX="" 29 | fi 30 | 31 | if grep -q "dataset_path: .*\.json" "$new_config_file"; then 32 | # .json data_path 33 | sed -i$SED_I_SUFFIX "s|\(dataset_path: \)\(.*\)\(/[^/]*\)\(.json\)|\1\2\3_$hostname\4|" "$new_config_file" 34 | else 35 | # dir dataset_path 36 | sed -i$SED_I_SUFFIX "s|\(dataset_path: '\)\(.*\)'\(.*\)|\1\2_$hostname'\3|" "$new_config_file" 37 | fi 38 | 39 | if grep -q "export_path: .*\.json" "$new_config_file"; then 40 | # .json data_path 41 | sed -i$SED_I_SUFFIX "s|\(export_path: \)\(.*\)\(/[^/]*\)\(.json\)|\1\2\3_$hostname\4|" "$new_config_file" 42 | else 43 | # dir export_path 44 | sed -i$SED_I_SUFFIX "s|\(export_path: '\)\(.*\)'\(.*\)|\1\2_$hostname'\3|" "$new_config_file" 45 | fi 46 | 47 | # run to process data 48 | python tools/process_data.py --config "$new_config_file" 49 | -------------------------------------------------------------------------------- /scripts/run_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=parallel_data_processing 3 | #SBATCH --ntasks= 4 | #SBATCH --nodes= 
5 | #SBATCH --time= 6 | #SBATCH --partition=your_partition_name 7 | #SBATCH --output=parallel_data_processing_%j.out 8 | #SBATCH --error=parallel_data_processing_%j.err 9 | #SBATCH --exclusive 10 | 11 | # set data-juicer and config file path 12 | datajuicer_path= # please fill the actual path of datajuicer 13 | config_path= # please fill the actual path of config file 14 | 15 | 16 | cd $datajuicer_path 17 | 18 | readarray -t nodes <<< "$(sinfo --noheader --states=idle,mixed --format=%n)" 19 | 20 | PARTITION_SCRIPT=./scripts/dlc/partition_data_dlc.py 21 | 22 | # set dataset path 23 | JSON_FILE_PATH = # please fill the actual path of dataset file 24 | 25 | # split_dataset 26 | python $PARTITION_SCRIPT --input_file_path $JSON_FILE_PATH --output_file_path $JSON_FILE_PATH --hostnames "${nodes[@]}" 27 | 28 | # run on nodes 29 | 30 | for node in "${nodes[@]}"; do 31 | echo $node 32 | nohup srun --nodes=1 --ntasks=1 -w $node scripts/dlc/run_on_dlc.sh > output_$node.log 2>&1 & 33 | done 34 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/__init__.py -------------------------------------------------------------------------------- /tests/analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/analysis/__init__.py -------------------------------------------------------------------------------- /tests/analysis/test_collector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import torch.distributions 5 | 6 | from data_juicer.analysis.collector import TextTokenDistCollector 7 | 8 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 9 | 10 | class TextTokenDistCollectorTest(DataJuicerTestCaseBase): 11 | 12 | test_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 13 | '..', 14 | '..', 15 | 'demos', 16 | 'data', 17 | 'demo-dataset.jsonl') 18 | 19 | tokenizer_model = 'EleutherAI/pythia-6.9b-deduped' 20 | 21 | @classmethod 22 | def tearDownClass(cls) -> None: 23 | super().tearDownClass(cls.tokenizer_model) 24 | 25 | def test_basic_func(self): 26 | collector = TextTokenDistCollector(self.tokenizer_model) 27 | dist = collector.collect(self.test_data_path, 'text') 28 | self.assertIsInstance(dist, torch.distributions.Categorical) 29 | 30 | 31 | if __name__ == '__main__': 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /tests/analysis/test_draw.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | from data_juicer.analysis.draw import draw_heatmap 6 | 7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 8 | 9 | class DrawTest(DataJuicerTestCaseBase): 10 | 11 | def test_basic_func(self): 12 | test_data = [ 13 | {'a': 1, 'b': 2, 'c': 3}, 14 | {'a': 4, 'b': 5, 'c': 6}, 15 | {'a': 7, 'b': 8, 'c': 9}, 16 | {'a': 10, 'b': 11, 'c': 12}, 17 | {'a': 13, 'b': 14, 'c': 15}, 18 | ] 19 | data = pd.DataFrame.from_records(test_data) 20 | ret = draw_heatmap(data, data.columns, triangle=True, show=True) 21 | self.assertIsInstance(ret, 
plt.Figure) 22 | ret = draw_heatmap(data, data.columns, show=True) 23 | self.assertIsInstance(ret, plt.Figure) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /tests/benchmark_performance/configs/audio.yaml: -------------------------------------------------------------------------------- 1 | # The config file for performance benchmark to measure the processing speed for 2 | # the current Data-Juicer system. OPs are selected according to their tags and 3 | # types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md) 4 | 5 | project_name: 'performance-benchmark-audio' 6 | dataset_path: 'perf_bench_data/audio/audio-10k.jsonl' 7 | export_path: 'outputs/performance_benchmark_audio/res.jsonl' 8 | np: 16 9 | use_cache: false 10 | 11 | process: 12 | - audio_duration_filter: 13 | - audio_nmf_snr_filter: 14 | - audio_size_filter: 15 | -------------------------------------------------------------------------------- /tests/benchmark_performance/configs/image.yaml: -------------------------------------------------------------------------------- 1 | # The config file for performance benchmark to measure the processing speed for 2 | # the current Data-Juicer system. OPs are selected according to their tags and 3 | # types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md) 4 | 5 | project_name: 'performance-benchmark-image' 6 | dataset_path: 'perf_bench_data/image/10k.jsonl' 7 | export_path: 'outputs/performance_benchmark_image/res.jsonl' 8 | np: 16 9 | use_cache: false 10 | 11 | process: 12 | - image_aesthetics_filter: 13 | hf_scorer_model: 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE' 14 | min_score: 0.0 15 | mem_required: '1500MB' 16 | - image_captioning_mapper: 17 | hf_img2seq: 'Salesforce/blip2-opt-2.7b' 18 | caption_num: 1 19 | keep_original_sample: false 20 | mem_required: '16GB' 21 | - image_shape_filter: 22 | - image_blur_mapper: 23 | - image_deduplicator: 24 | -------------------------------------------------------------------------------- /tests/benchmark_performance/configs/text.yaml: -------------------------------------------------------------------------------- 1 | # The config file for performance benchmark to measure the processing speed for 2 | # the current Data-Juicer system. OPs are selected according to their tags and 3 | # types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md) 4 | 5 | project_name: 'performance-benchmark-text' 6 | dataset_path: 'perf_bench_data/text/wiki-10k.jsonl' 7 | export_path: 'outputs/performance_benchmark_text/res.jsonl' 8 | np: 16 9 | use_cache: false 10 | 11 | process: 12 | - whitespace_normalization_mapper: 13 | - token_num_filter: 14 | hf_tokenizer: 'EleutherAI/pythia-6.9b-deduped' 15 | min_num: 0 16 | - document_deduplicator: 17 | lowercase: false 18 | ignore_non_character: false 19 | - topk_specified_field_selector: 20 | field_key: '__dj__stats__.num_token' 21 | topk: 1000 22 | -------------------------------------------------------------------------------- /tests/benchmark_performance/configs/video.yaml: -------------------------------------------------------------------------------- 1 | # The config file for performance benchmark to measure the processing speed for 2 | # the current Data-Juicer system. 
OPs are selected according to their tags and 3 | # types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md) 4 | 5 | project_name: 'performance-benchmark-video' 6 | dataset_path: 'perf_bench_data/video/msr_vtt_train.jsonl' 7 | export_path: 'outputs/performance_benchmark_video/res.jsonl' 8 | np: 16 9 | use_cache: false 10 | 11 | process: 12 | - video_nsfw_filter: 13 | hf_nsfw_model: 'Falconsai/nsfw_image_detection' 14 | max_score: 1.0 15 | mem_required: '1GB' 16 | - video_tagging_from_frames_mapper: 17 | mem_required: '9GB' 18 | - video_duration_filter: 19 | - video_split_by_key_frame_mapper: 20 | keep_original_sample: false 21 | - video_deduplicator: 22 | -------------------------------------------------------------------------------- /tests/benchmark_performance/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # setup wandb configs 4 | export WANDB_BASE_URL=$1 5 | export WANDB_API_KEY=$2 6 | 7 | BENCH_PATH=$(cd "$(dirname "$0")"; pwd) 8 | RELATIVE_DJ_PATH=../.. 9 | MODALITIES=("text" "image" "video" "audio") 10 | 11 | cd $BENCH_PATH 12 | 13 | # 1. prepare dataset 14 | echo "Preparing benchmark dataset..." 15 | wget -q http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxf perf_bench_data.tar.gz 16 | 17 | # 2. run the benchmark 18 | for modality in ${MODALITIES[@]} 19 | do 20 | echo "Running benchmark for $modality modality..." 21 | python $RELATIVE_DJ_PATH/tools/process_data.py --config configs/$modality.yaml 22 | done 23 | 24 | # 3. collect & upload benchmark results 25 | echo "Collecting and reporting benchmark results..." 26 | python report.py 27 | 28 | # 4. clear resources 29 | echo "Clearing resources..." 30 | rm -rf perf_bench_data.tar.gz 31 | rm -rf perf_bench_data/ 32 | -------------------------------------------------------------------------------- /tests/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/config/__init__.py -------------------------------------------------------------------------------- /tests/config/demo_4_test.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for Arxiv dataset 2 | 3 | # global parameters 4 | project_name: 'test_demo' 5 | dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file 6 | np: 4 # number of subprocess to process your dataset 7 | 8 | export_path: './outputs/demo/demo-processed.parquet' 9 | 10 | # process schedule 11 | # a list of several process operators with their arguments 12 | process: 13 | - whitespace_normalization_mapper: 14 | - language_id_score_filter: 15 | lang: 'zh' 16 | - document_deduplicator: # deduplicate text samples using md5 hashing exact matching method 17 | lowercase: false # whether to convert text to lower case 18 | ignore_non_character: false 19 | - remove_table_text_mapper: 20 | -------------------------------------------------------------------------------- /tests/config/demo_4_test_bad_val.yaml: -------------------------------------------------------------------------------- 1 | # Process config example for Arxiv dataset 2 | 3 | # global parameters 4 | project_name: 'test_demo' 5 | dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file 6 | np: 4 # number of subprocess to process your dataset 7 | 
8 | export_path: './outputs/demo/demo-processed.parquet' 9 | 10 | # process schedule 11 | # a list of several process operators with their arguments 12 | process: 13 | - whitespace_normalization_mapper: 14 | - language_id_score_filter: 15 | lang: 'zh' 16 | - document_deduplicator: # deduplicate text samples using md5 hashing exact matching method 17 | lowercase: false # whether to convert text to lower case 18 | ignore_non_character: false 19 | - remove_table_text_mapper: 20 | max_col: 30 # !! a bad value !! -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/core/data/__init__.py -------------------------------------------------------------------------------- /tests/core/data/test_data/sample.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Today is Sunday and it's a happy day!"} 2 | {"text": "Today is Monday and it's a happy day!"} 3 | {"text": "Today is Tuesday and it's a happy day!"} 4 | {"text": "Today is Wednesday and it's a happy day!"} 5 | {"text": "Today is Thursday and it's a happy day!"} 6 | {"text": "Today is Friday and it's a happy day!"} 7 | {"text": "Today is Saturday and it's a happy day!"} 8 | -------------------------------------------------------------------------------- /tests/core/data/test_data/sample.txt: -------------------------------------------------------------------------------- 1 | Today is Sunday and it's a happy day! 
2 | -------------------------------------------------------------------------------- /tests/core/data/test_data/test_config.yaml: -------------------------------------------------------------------------------- 1 | project_name: 'dataset-local-json' 2 | dataset: 3 | configs: 4 | - type: 'local' 5 | path: 'sample.jsonl' -------------------------------------------------------------------------------- /tests/core/data/test_data/test_config_list.yaml: -------------------------------------------------------------------------------- 1 | project_name: 'dataset-local-list' 2 | dataset: 3 | configs: 4 | - type: 'local' 5 | path: 'sample.jsonl' 6 | - type: 'local' 7 | path: 'sample.txt' -------------------------------------------------------------------------------- /tests/core/data/test_data/test_config_ray.yaml: -------------------------------------------------------------------------------- 1 | 2 | # global parameters 3 | project_name: 'ray-demo-new-config' 4 | dataset: 5 | configs: 6 | - type: local 7 | path: ./test_data/sample.jsonl # path to your dataset directory or file 8 | 9 | export_path: './outputs/demo/demo-processed' 10 | 11 | executor_type: 'ray' 12 | ray_address: 'auto' 13 | 14 | -------------------------------------------------------------------------------- /tests/download/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/download/__init__.py -------------------------------------------------------------------------------- /tests/format/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/format/__init__.py -------------------------------------------------------------------------------- /tests/format/data/structured/demo-dataset.csv: -------------------------------------------------------------------------------- 1 | text,meta 2 | Today is Sunday and it's a happy day!,"{'src': 'Arxiv', 'date': datetime.datetime(2023, 4, 27, 0, 0), 'version': '1.0', 'author': None}" 3 | Do you need a cup of coffee?,"{'src': 'code', 'date': None, 'version': None, 'author': 'xxx'}" 4 | 你好,请问你是谁,"{'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}" 5 | "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.","{'src': 'Oscar', 'date': None, 'version': '2.0', 'author': None}" 6 | 欢迎来到阿里巴巴!,"{'src': 'customized', 'date': None, 'version': '0.1', 'author': 'xxx'}" 7 | This paper proposed a novel method on LLM pretraining.,"{'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}" 8 | -------------------------------------------------------------------------------- /tests/format/data/structured/demo-dataset.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}} 2 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}} 3 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}} 4 | {"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}} 5 | {"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}} 6 | {"text": "Do you need a 
cup of coffee?", "meta": {"src": "code", "author": "xxx"}} 7 | -------------------------------------------------------------------------------- /tests/format/data/structured/demo-dataset.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/format/data/structured/demo-dataset.parquet -------------------------------------------------------------------------------- /tests/format/data/structured/demo-dataset.tsv: -------------------------------------------------------------------------------- 1 | text meta 2 | Today is Sunday and it's a happy day! {'src': 'Arxiv', 'date': datetime.datetime(2023, 4, 27, 0, 0), 'version': '1.0', 'author': None} 3 | Do you need a cup of coffee? {'src': 'code', 'date': None, 'version': None, 'author': 'xxx'} 4 | 你好,请问你是谁 {'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'} 5 | Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément. {'src': 'Oscar', 'date': None, 'version': '2.0', 'author': None} 6 | 欢迎来到阿里巴巴! {'src': 'customized', 'date': None, 'version': '0.1', 'author': 'xxx'} 7 | This paper proposed a novel method on LLM pretraining. {'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'} 8 | -------------------------------------------------------------------------------- /tests/format/data/text/sample1.txt: -------------------------------------------------------------------------------- 1 | Today is Sunday and it's a happy day! 2 | -------------------------------------------------------------------------------- /tests/format/data/text/sample2.txt: -------------------------------------------------------------------------------- 1 | Do you need a cup of coffee? 2 | -------------------------------------------------------------------------------- /tests/format/data/text/sample3.txt: -------------------------------------------------------------------------------- 1 | 你好,请问你是谁 2 | -------------------------------------------------------------------------------- /tests/format/data/text/sample4.txt: -------------------------------------------------------------------------------- 1 | Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément. 2 | -------------------------------------------------------------------------------- /tests/format/data/text/sample5.txt: -------------------------------------------------------------------------------- 1 | 欢迎来到阿里巴巴! 2 | -------------------------------------------------------------------------------- /tests/format/data/text/sample6.txt: -------------------------------------------------------------------------------- 1 | This paper proposed a novel method on LLM pretraining. 
2 | -------------------------------------------------------------------------------- /tests/format/test_csv_formatter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from data_juicer.format.csv_formatter import CsvFormatter 5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 6 | 7 | 8 | class CsvFormatterTest(DataJuicerTestCaseBase): 9 | 10 | def setUp(self): 11 | self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 12 | 'data', 'structured') 13 | self._file = os.path.join(self._path, 'demo-dataset.csv') 14 | print(self._file) 15 | 16 | def test_csv_file(self): 17 | formatter = CsvFormatter(self._file) 18 | ds = formatter.load_dataset() 19 | self.assertEqual(len(ds), 6) 20 | self.assertEqual(list(ds.features.keys()), ['text', 'meta']) 21 | 22 | def test_csv_path(self): 23 | formatter = CsvFormatter(self._path) 24 | ds = formatter.load_dataset() 25 | self.assertEqual(len(ds), 6) 26 | self.assertEqual(list(ds.features.keys()), ['text', 'meta']) 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/format/test_empty_formatter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from data_juicer.format.empty_formatter import EmptyFormatter 5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 6 | 7 | 8 | class EmptyFormatterTest(DataJuicerTestCaseBase): 9 | 10 | text_key = 'text' 11 | 12 | def test_empty_dataset(self): 13 | ds_len = 10 14 | formatter = EmptyFormatter(length=ds_len, feature_keys=[self.text_key]) 15 | ds = formatter.load_dataset() 16 | 17 | self.assertEqual(len(ds), ds_len) 18 | self.assertEqual(list(ds.features.keys()), [self.text_key]) 19 | 20 | for item in ds: 21 | self.assertDictEqual(item, {self.text_key: None}) 22 | 23 | # test map 24 | update_column = {self.text_key: 1} 25 | 26 | def map_fn(sample): 27 | sample.update(update_column) 28 | return sample 29 | 30 | ds = ds.map(map_fn) 31 | self.assertEqual(len(ds), ds_len) 32 | for item in ds: 33 | self.assertDictEqual(item, update_column) 34 | 35 | # test filter 36 | def filter_fn(sample): 37 | return sample[self.text_key] > 2 38 | 39 | ds = ds.filter(filter_fn) 40 | self.assertEqual(len(ds), 0) 41 | 42 | 43 | if __name__ == '__main__': 44 | unittest.main() 45 | -------------------------------------------------------------------------------- /tests/format/test_parquet_formatter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from data_juicer.format.parquet_formatter import ParquetFormatter 5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 6 | 7 | 8 | class CsvFormatterTest(DataJuicerTestCaseBase): 9 | 10 | def setUp(self): 11 | self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 12 | 'data', 'structured') 13 | self._file = os.path.join(self._path, 'demo-dataset.parquet') 14 | print(self._file) 15 | 16 | def test_parquet_file(self): 17 | formatter = ParquetFormatter(self._file) 18 | ds = formatter.load_dataset() 19 | self.assertEqual(len(ds), 6) 20 | self.assertEqual(list(ds.features.keys()), ['text', 'meta']) 21 | 22 | def test_parquet_path(self): 23 | formatter = ParquetFormatter(self._path) 24 | ds = formatter.load_dataset() 25 | self.assertEqual(len(ds), 6) 26 | 
self.assertEqual(list(ds.features.keys()), ['text', 'meta']) 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/format/test_tsv_formatter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from data_juicer.format.tsv_formatter import TsvFormatter 5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 6 | 7 | 8 | class TsvFormatterTest(DataJuicerTestCaseBase): 9 | 10 | def setUp(self): 11 | self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 12 | 'data', 'structured') 13 | self._file = os.path.join(self._path, 'demo-dataset.tsv') 14 | print(self._file) 15 | 16 | def test_tsv_file(self): 17 | formatter = TsvFormatter(self._file) 18 | ds = formatter.load_dataset() 19 | self.assertEqual(len(ds), 6) 20 | self.assertEqual(list(ds.features.keys()), ['text', 'meta']) 21 | 22 | def test_tsv_path(self): 23 | formatter = TsvFormatter(self._path) 24 | ds = formatter.load_dataset() 25 | self.assertEqual(len(ds), 6) 26 | self.assertEqual(list(ds.features.keys()), ['text', 'meta']) 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/__init__.py -------------------------------------------------------------------------------- /tests/ops/aggregator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/aggregator/__init__.py -------------------------------------------------------------------------------- /tests/ops/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/common/__init__.py -------------------------------------------------------------------------------- /tests/ops/data/audio1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/audio1.wav -------------------------------------------------------------------------------- /tests/ops/data/audio2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/audio2.wav -------------------------------------------------------------------------------- /tests/ops/data/audio3.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/audio3.ogg -------------------------------------------------------------------------------- /tests/ops/data/blip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/blip.jpg -------------------------------------------------------------------------------- 
/tests/ops/data/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/cat.jpg -------------------------------------------------------------------------------- /tests/ops/data/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img1.png -------------------------------------------------------------------------------- /tests/ops/data/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img2.jpg -------------------------------------------------------------------------------- /tests/ops/data/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img3.jpg -------------------------------------------------------------------------------- /tests/ops/data/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img4.png -------------------------------------------------------------------------------- /tests/ops/data/img5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img5.jpg -------------------------------------------------------------------------------- /tests/ops/data/img6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img6.jpg -------------------------------------------------------------------------------- /tests/ops/data/img7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img7.jpg -------------------------------------------------------------------------------- /tests/ops/data/img8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img8.jpg -------------------------------------------------------------------------------- /tests/ops/data/img_pair_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img_pair_1.jpg -------------------------------------------------------------------------------- /tests/ops/data/img_pair_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/img_pair_2.jpg -------------------------------------------------------------------------------- /tests/ops/data/lena-face.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/lena-face.jpg -------------------------------------------------------------------------------- /tests/ops/data/lena.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/lena.jpg -------------------------------------------------------------------------------- /tests/ops/data/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video1.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video2.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video3-no-audio.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video3-no-audio.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video3.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video4.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video4.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video5.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/data/video5.mp4 -------------------------------------------------------------------------------- /tests/ops/deduplicator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/deduplicator/__init__.py -------------------------------------------------------------------------------- /tests/ops/filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/filter/__init__.py -------------------------------------------------------------------------------- /tests/ops/grouper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/grouper/__init__.py -------------------------------------------------------------------------------- /tests/ops/grouper/test_key_value_grouper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from 
data_juicer.core.data import NestedDataset as Dataset 4 | from data_juicer.ops.grouper.key_value_grouper import KeyValueGrouper 5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 6 | 7 | 8 | class KeyValueGrouperTest(DataJuicerTestCaseBase): 9 | 10 | def _run_helper(self, op, samples, target): 11 | dataset = Dataset.from_list(samples) 12 | new_dataset = op.run(dataset) 13 | 14 | for batched_sample in new_dataset: 15 | lang = batched_sample['meta'][0]['language'] 16 | self.assertEqual(batched_sample['text'], target[lang]) 17 | 18 | def test_key_value_grouper(self): 19 | 20 | source = [ 21 | { 22 | 'text': "Today is Sunday and it's a happy day!", 23 | 'meta': { 24 | 'language': 'en' 25 | } 26 | }, 27 | { 28 | 'text': "Welcome to Alibaba.", 29 | 'meta': { 30 | 'language': 'en' 31 | } 32 | }, 33 | { 34 | 'text': '欢迎来到阿里巴巴!', 35 | 'meta': { 36 | 'language': 'zh' 37 | } 38 | }, 39 | ] 40 | target = { 41 | 'en':[ 42 | "Today is Sunday and it's a happy day!", 43 | "Welcome to Alibaba." 44 | ], 45 | 'zh':[ 46 | '欢迎来到阿里巴巴!' 47 | ] 48 | } 49 | 50 | op = KeyValueGrouper(['meta.language']) 51 | self._run_helper(op, source, target) 52 | 53 | if __name__ == '__main__': 54 | unittest.main() -------------------------------------------------------------------------------- /tests/ops/grouper/test_naive_grouper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.core.data import NestedDataset as Dataset 4 | from data_juicer.ops.grouper.naive_grouper import NaiveGrouper 5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 6 | 7 | 8 | class NaiveGrouperTest(DataJuicerTestCaseBase): 9 | 10 | def _run_helper(self, op, samples, target): 11 | dataset = Dataset.from_list(samples) 12 | new_dataset = op.run(dataset) 13 | 14 | for d, t in zip(new_dataset, target): 15 | self.assertEqual(d['text'], t['text']) 16 | 17 | def test_naive_group(self): 18 | 19 | source = [ 20 | { 21 | 'text': "Today is Sunday and it's a happy day!" 22 | }, 23 | { 24 | 'text': 25 | "Sur la plateforme MT4, plusieurs manières d'accéder à \n" 26 | 'ces fonctionnalités sont conçues simultanément.' 27 | }, 28 | { 29 | 'text': '欢迎来到阿里巴巴!' 30 | }, 31 | ] 32 | target = [ 33 | { 34 | 'text':[ 35 | "Today is Sunday and it's a happy day!", 36 | "Sur la plateforme MT4, plusieurs manières d'accéder à \n" 37 | 'ces fonctionnalités sont conçues simultanément.', 38 | '欢迎来到阿里巴巴!' 
39 | ] 40 | } 41 | ] 42 | 43 | op = NaiveGrouper() 44 | self._run_helper(op, source, target) 45 | 46 | if __name__ == '__main__': 47 | unittest.main() -------------------------------------------------------------------------------- /tests/ops/mapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/mapper/__init__.py -------------------------------------------------------------------------------- /tests/ops/mapper/annotation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/mapper/annotation/__init__.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_clean_copyright_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.core.data import NestedDataset as Dataset 4 | from data_juicer.ops.mapper.clean_copyright_mapper import CleanCopyrightMapper 5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 6 | 7 | 8 | class CleanCopyrightMapperTest(DataJuicerTestCaseBase): 9 | 10 | def setUp(self): 11 | self.op = CleanCopyrightMapper() 12 | 13 | def _run_clean_copyright(self, samples): 14 | dataset = Dataset.from_list(samples) 15 | dataset = dataset.map(self.op.process, batch_size=2) 16 | 17 | for data in dataset: 18 | self.assertEqual(data['text'], data['target']) 19 | 20 | def test_clean_copyright(self): 21 | 22 | samples = [{ 23 | 'text': '这是一段 /* 多行注释\n注释内容copyright\n*/ 的文本。另外还有一些 // 单行注释。', 24 | 'target': '这是一段 的文本。另外还有一些 // 单行注释。' 25 | }, { 26 | 'text': '如果多行/*注释中没有\n关键词,那么\n这部分注释也不会\n被清除*/\n会保留下来', 27 | 'target': '如果多行/*注释中没有\n关键词,那么\n这部分注释也不会\n被清除*/\n会保留下来' 28 | }, { 29 | 'text': '//if start with\n//that will be cleaned \n evenly', 30 | 'target': ' evenly' 31 | }, { 32 | 'text': 'http://www.nasosnsncc.com', 33 | 'target': 'http://www.nasosnsncc.com' 34 | }, { 35 | 'text': '#if start with\nthat will be cleaned \n#evenly', 36 | 'target': 'that will be cleaned \n#evenly' 37 | }, { 38 | 'text': '--if start with\n--that will be cleaned \n#evenly', 39 | 'target': '' 40 | }] 41 | self._run_clean_copyright(samples) 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /tests/ops/mapper/test_fix_unicode_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.core.data import NestedDataset as Dataset 4 | from data_juicer.ops.mapper.fix_unicode_mapper import FixUnicodeMapper 5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 6 | 7 | 8 | class FixUnicodeMapperTest(DataJuicerTestCaseBase): 9 | 10 | def setUp(self): 11 | self.op = FixUnicodeMapper() 12 | 13 | def _run_fix_unicode(self, samples): 14 | dataset = Dataset.from_list(samples) 15 | dataset = dataset.map(self.op.process, batch_size=2) 16 | 17 | for data in dataset: 18 | self.assertEqual(data['text'], data['target']) 19 | 20 | def test_bad_unicode_text(self): 21 | 22 | samples = [ 23 | { 24 | 'text': '✔ No problems', 25 | 'target': '✔ No problems' 26 | }, 27 | { 28 | 'text': 29 | 'The Mona Lisa doesn’t have eyebrows.', 30 | 'target': 'The Mona Lisa doesn\'t have eyebrows.' 
31 | }, 32 | ] 33 | 34 | self._run_fix_unicode(samples) 35 | 36 | def test_good_unicode_text(self): 37 | samples = [ 38 | { 39 | 'text': 'No problems', 40 | 'target': 'No problems' 41 | }, 42 | { 43 | 'text': '阿里巴巴', 44 | 'target': '阿里巴巴' 45 | }, 46 | ] 47 | self._run_fix_unicode(samples) 48 | 49 | 50 | if __name__ == '__main__': 51 | unittest.main() 52 | -------------------------------------------------------------------------------- /tests/ops/mapper/test_generate_qa_from_examples_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from loguru import logger 4 | 5 | from data_juicer.ops.mapper.generate_qa_from_examples_mapper import \ 6 | GenerateQAFromExamplesMapper 7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 8 | 9 | @unittest.skip('unknown vllm connection error') 10 | class GenerateQAFromExamplesMapperTest(DataJuicerTestCaseBase): 11 | text_key = 'text' 12 | 13 | def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1): 14 | op = GenerateQAFromExamplesMapper( 15 | seed_file='demos/data/demo-dataset-chatml.jsonl', 16 | example_num=3, 17 | enable_vllm=enable_vllm, 18 | sampling_params=sampling_params, 19 | ) 20 | 21 | from data_juicer.format.empty_formatter import EmptyFormatter 22 | dataset = EmptyFormatter(3, [self.text_key]).load_dataset() 23 | 24 | results = dataset.map(op.process, num_proc=num_proc, with_rank=True) 25 | 26 | for row in results: 27 | logger.info(row) 28 | self.assertIn(op.query_key, row) 29 | self.assertIn(op.response_key, row) 30 | 31 | def test(self): 32 | sampling_params = {'max_new_tokens': 200} 33 | self._run_op(sampling_params=sampling_params) 34 | 35 | def test_multi_process(self): 36 | sampling_params = {'max_new_tokens': 200} 37 | self._run_op(sampling_params=sampling_params, num_proc=2) 38 | 39 | def test_vllm(self): 40 | sampling_params = {'max_tokens': 200} 41 | self._run_op(enable_vllm=True, sampling_params=sampling_params) 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /tests/ops/mapper/test_mllm_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from data_juicer.ops.mapper.mllm_mapper import MllmMapper 3 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 4 | import os 5 | 6 | class MllmMapperTest(DataJuicerTestCaseBase): 7 | 8 | hf_model = 'llava-hf/llava-v1.6-vicuna-7b-hf' 9 | 10 | text_key = 'text' 11 | image_key = "images" 12 | 13 | @classmethod 14 | def tearDownClass(cls) -> None: 15 | super().tearDownClass(cls.hf_model) 16 | 17 | def _run_mllm(self): 18 | op = MllmMapper( 19 | hf_model=self.hf_model, 20 | temperature=0.9, 21 | top_p=0.95, 22 | max_new_tokens=512 23 | ) 24 | 25 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 26 | 'data') 27 | img2_path = os.path.join(data_path, 'img2.jpg') 28 | img3_path = os.path.join(data_path, 'img3.jpg') 29 | 30 | samples = [ 31 | {self.text_key: 'Describe this image.', self.image_key: [img2_path, img3_path]}, 32 | ] 33 | 34 | for sample in samples: 35 | result = op.process(sample) 36 | self.assertIsInstance(sample[self.text_key], list) 37 | self.assertEqual(len(sample[self.text_key]), 2) 38 | print(f'Output results: {result}') 39 | 40 | def test_mllm(self): 41 | self._run_mllm() 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | 
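The two model-backed mapper tests above build their ops directly and feed them either an `EmptyFormatter` dataset or raw sample dicts. As a rough standalone illustration of the same pattern (not part of the test suite), the sketch below generates question-answer pairs from seed examples. It reuses only the calls visible in the tests above and assumes that the seed file `demos/data/demo-dataset-chatml.jsonl` is reachable from the working directory and that the mapper's default model can actually be loaded; neither is verified here.

```python
# Minimal sketch, mirroring tests/ops/mapper/test_generate_qa_from_examples_mapper.py.
from data_juicer.format.empty_formatter import EmptyFormatter
from data_juicer.ops.mapper.generate_qa_from_examples_mapper import \
    GenerateQAFromExamplesMapper

# Assumption: paths are resolved relative to the repository root.
op = GenerateQAFromExamplesMapper(
    seed_file='demos/data/demo-dataset-chatml.jsonl',
    example_num=3,
    enable_vllm=False,  # HF backend; the vLLM path is exercised separately in the test
    sampling_params={'max_new_tokens': 200},
)

# Three empty placeholder rows; the mapper fills in the query/response fields.
dataset = EmptyFormatter(3, ['text']).load_dataset()
results = dataset.map(op.process, with_rank=True)

for row in results:
    print(row.get(op.query_key), '->', row.get(op.response_key))
```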
-------------------------------------------------------------------------------- /tests/ops/mapper/test_optimize_qa_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from loguru import logger 4 | 5 | from data_juicer.core.data import NestedDataset as Dataset 6 | from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper 7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 8 | 9 | @unittest.skip('unknown vllm connection error') 10 | class OptimizeQAMapperTest(DataJuicerTestCaseBase): 11 | 12 | def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1): 13 | 14 | op = OptimizeQAMapper(enable_vllm=enable_vllm, 15 | sampling_params=sampling_params) 16 | 17 | samples = [{ 18 | 'query': 19 | '鱼香肉丝怎么做?', 20 | 'response': 21 | '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' 22 | }, { 23 | 'query': '什么是蚂蚁上树?', 24 | 'response': '蚂蚁上树是一道中国菜。' 25 | }] 26 | dataset = Dataset.from_list(samples) 27 | results = dataset.map(op.process, num_proc=num_proc, with_rank=True) 28 | 29 | for row in results: 30 | logger.info(f'Output results: {row}') 31 | self.assertNotEqual(row['query'], '') 32 | self.assertNotEqual(row['response'], '') 33 | 34 | def test(self): 35 | sampling_params = {'max_new_tokens': 200} 36 | self._run_op(sampling_params=sampling_params) 37 | 38 | def test_multi_process(self): 39 | sampling_params = {'max_new_tokens': 200} 40 | self._run_op(sampling_params=sampling_params, num_proc=2) 41 | 42 | def test_vllm(self): 43 | sampling_params = {'max_tokens': 200} 44 | self._run_op(enable_vllm=True, sampling_params=sampling_params) 45 | 46 | 47 | if __name__ == '__main__': 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /tests/ops/mapper/test_optimize_query_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from loguru import logger 4 | 5 | from data_juicer.core.data import NestedDataset as Dataset 6 | from data_juicer.ops.mapper.optimize_query_mapper import OptimizeQueryMapper 7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 8 | 9 | @unittest.skip('unknown vllm connection error') 10 | class OptimizeQueryMapperTest(DataJuicerTestCaseBase): 11 | 12 | def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1): 13 | 14 | op = OptimizeQueryMapper( 15 | hf_model='alibaba-pai/Qwen2-7B-Instruct-Refine', 16 | input_template='{}', 17 | qa_pair_template='{}', 18 | enable_vllm=enable_vllm, 19 | sampling_params=sampling_params) 20 | 21 | samples = [{ 22 | 'query': 23 | '鱼香肉丝怎么做?', 24 | 'response': 25 | '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' 26 | }, { 27 | 'query': '什么是蚂蚁上树?', 28 | 'response': '蚂蚁上树是一道中国菜。' 29 | }] 30 | dataset = Dataset.from_list(samples) 31 | results = dataset.map(op.process, num_proc=num_proc, with_rank=True) 32 | 33 | for row in results: 34 | logger.info(f'Output results: {row}') 35 | self.assertNotEqual(row['query'], '') 36 | 37 | def test(self): 38 | sampling_params = {'max_new_tokens': 200} 39 | self._run_op(sampling_params=sampling_params) 40 | 41 | def test_multi_process(self): 42 | sampling_params = {'max_new_tokens': 200} 43 | self._run_op(sampling_params=sampling_params, num_proc=2) 44 | 45 | def test_vllm(self): 46 | sampling_params = {'max_tokens': 200} 47 | self._run_op(enable_vllm=True, sampling_params=sampling_params) 48 | 49 | 50 | if __name__ == '__main__': 51 | unittest.main() 52 | 
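One detail worth calling out in the optimize-mapper tests above (and repeated in the response-mapper test that follows): the HuggingFace path is given `{'max_new_tokens': 200}` while the vLLM path is given `{'max_tokens': 200}`. The sketch below simply factors that switch out. It is an illustration based only on what these tests pass in, assuming `OptimizeQAMapper` accepts the keyword arguments shown here and that its default model is available locally or downloadable.

```python
# Minimal sketch: run OptimizeQAMapper over a tiny in-memory dataset, picking
# the sampling-parameter key that each backend appears to expect in the tests above.
from data_juicer.core.data import NestedDataset as Dataset
from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper


def build_op(enable_vllm: bool = False) -> OptimizeQAMapper:
    # HF generation uses 'max_new_tokens'; vLLM uses 'max_tokens'.
    key = 'max_tokens' if enable_vllm else 'max_new_tokens'
    return OptimizeQAMapper(enable_vllm=enable_vllm, sampling_params={key: 200})


samples = [{'query': '什么是蚂蚁上树?', 'response': '蚂蚁上树是一道中国菜。'}]
dataset = Dataset.from_list(samples)

op = build_op(enable_vllm=False)
results = dataset.map(op.process, with_rank=True)
for row in results:
    print(row['query'], '->', row['response'])
```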
-------------------------------------------------------------------------------- /tests/ops/mapper/test_optimize_response_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from loguru import logger 4 | 5 | from data_juicer.core.data import NestedDataset as Dataset 6 | from data_juicer.ops.mapper.optimize_response_mapper import \ 7 | OptimizeResponseMapper 8 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 9 | 10 | @unittest.skip('unknown vllm connection error') 11 | class OptimizeResponseMapperTest(DataJuicerTestCaseBase): 12 | 13 | def _run_op(self, enable_vllm=False, sampling_params=None, num_proc=1): 14 | 15 | op = OptimizeResponseMapper(enable_vllm=enable_vllm, 16 | sampling_params=sampling_params) 17 | 18 | samples = [{ 19 | 'query': 20 | '鱼香肉丝怎么做?', 21 | 'response': 22 | '鱼香肉丝是将猪肉丝与胡萝卜、青椒、木耳炒制,调入调味料如酱油、醋和辣豆瓣酱,快速翻炒而成的美味佳肴。' 23 | }, { 24 | 'query': '什么是蚂蚁上树?', 25 | 'response': '蚂蚁上树是一道中国菜。' 26 | }] 27 | dataset = Dataset.from_list(samples) 28 | results = dataset.map(op.process, num_proc=num_proc, with_rank=True) 29 | 30 | for row in results: 31 | logger.info(f'Output results: {row}') 32 | self.assertNotEqual(row['response'], '') 33 | 34 | def test(self): 35 | sampling_params = {'max_new_tokens': 200} 36 | self._run_op(sampling_params=sampling_params) 37 | 38 | def test_multi_process(self): 39 | sampling_params = {'max_new_tokens': 200} 40 | self._run_op(sampling_params=sampling_params, num_proc=2) 41 | 42 | def test_vllm(self): 43 | sampling_params = {'max_tokens': 200} 44 | self._run_op(enable_vllm=True, sampling_params=sampling_params) 45 | 46 | 47 | if __name__ == '__main__': 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /tests/ops/mapper/test_punctuation_normalization_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.core.data import NestedDataset as Dataset 4 | from data_juicer.ops.mapper.punctuation_normalization_mapper import \ 5 | PunctuationNormalizationMapper 6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 7 | 8 | 9 | class PunctuationNormalizationMapperTest(DataJuicerTestCaseBase): 10 | 11 | def setUp(self): 12 | self.op = PunctuationNormalizationMapper() 13 | 14 | def _run_punctuation_normalization(self, samples): 15 | dataset = Dataset.from_list(samples) 16 | dataset = dataset.map(self.op.process, batch_size=2) 17 | 18 | for data in dataset: 19 | self.assertEqual(data['text'], data['target']) 20 | 21 | def test_case(self): 22 | 23 | samples = [{ 24 | 'text': 25 | ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►', 26 | 'target': 27 | ",.,\"\"\"\"\"\"\"\"\"\"'::?!();- - . 
~'...-<>[]%-" 28 | }] 29 | 30 | self._run_punctuation_normalization(samples) 31 | 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /tests/ops/mapper/test_remove_bibliography_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.core.data import NestedDataset as Dataset 4 | from data_juicer.ops.mapper.remove_bibliography_mapper import \ 5 | RemoveBibliographyMapper 6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 7 | 8 | 9 | class RemoveBibliographyMapperTest(DataJuicerTestCaseBase): 10 | 11 | def setUp(self): 12 | self.op = RemoveBibliographyMapper() 13 | 14 | def _run_remove_bibliography(self, samples): 15 | dataset = Dataset.from_list(samples) 16 | dataset = dataset.map(self.op.process, batch_size=2) 17 | 18 | for data in dataset: 19 | self.assertEqual(data['text'], data['target']) 20 | 21 | def test_bibliography_case(self): 22 | 23 | samples = [{ 24 | 'text': 25 | "%%\n%% This is file `sample-sigconf.tex\\clearpage\n\\bibliographystyle{ACM-Reference-Format}\n\\bibliography{sample-base}\n\\end{document}\n\\endinput\n%%\n%% End of file `sample-sigconf.tex'.\n", # noqa: E501 26 | 'target': 27 | '%%\n%% This is file `sample-sigconf.tex\\clearpage\n\\bibliographystyle{ACM-Reference-Format}\n' # noqa: E501 28 | }] 29 | 30 | self._run_remove_bibliography(samples) 31 | 32 | def test_ref_case(self): 33 | 34 | samples = [{ 35 | 'text': 36 | "%%\n%% This is file `sample-sigconf.tex\\clearpage\n\\begin{references}\n\\end{document}\n\\endinput\n%%\n%% End of file `sample-sigconf.tex'.\n", # noqa: E501 37 | 'target': 38 | '%%\n%% This is file `sample-sigconf.tex\\clearpage\n' # noqa: E501 39 | }] 40 | 41 | self._run_remove_bibliography(samples) 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /tests/ops/mapper/test_remove_specific_chars_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.core.data import NestedDataset as Dataset 4 | from data_juicer.ops.mapper.remove_specific_chars_mapper import \ 5 | RemoveSpecificCharsMapper 6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 7 | 8 | 9 | class RemoveSpecificCharsMapperTest(DataJuicerTestCaseBase): 10 | 11 | def setUp(self): 12 | self.op = RemoveSpecificCharsMapper() 13 | 14 | def _run_helper(self, samples): 15 | dataset = Dataset.from_list(samples) 16 | dataset = dataset.map(self.op.process, batch_size=2) 17 | 18 | for data in dataset: 19 | self.assertEqual(data['text'], data['target']) 20 | 21 | def test_complete_html_text(self): 22 | 23 | samples = [ 24 | { 25 | 'text': '这是一个干净的文本。Including Chinese and English.', 26 | 'target': '这是一个干净的文本。Including Chinese and English.', 27 | }, 28 | { 29 | 'text': '◆●■►▼▲▴∆▻▷❖♡□', 30 | 'target': '', 31 | }, 32 | { 33 | 'text': '►This is a dirty text ▻ 包括中文和英文', 34 | 'target': 'This is a dirty text 包括中文和英文', 35 | }, 36 | { 37 | 'text': '多个●■►▼这样的特殊字符可以►▼▲▴∆吗?', 38 | 'target': '多个这样的特殊字符可以吗?', 39 | }, 40 | { 41 | 'text': '未指定的●■☛₨➩►▼▲特殊字符会☻▷❖被删掉吗??', 42 | 'target': '未指定的☛₨➩特殊字符会☻被删掉吗??', 43 | }, 44 | ] 45 | self._run_helper(samples) 46 | 47 | 48 | if __name__ == '__main__': 49 | unittest.main() 50 | -------------------------------------------------------------------------------- 
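The cleaning-mapper tests above each exercise a single op against hand-written `target` strings. Outside of unittest, several of these ops can be applied back to back with the same `NestedDataset.map(op.process, ...)` call the tests use. The sketch below chains three of them; it assumes only the import paths and call pattern visible in these tests and is not the project's recommended pipeline interface (recipes such as `tests/config/demo_4_test.yaml` drive the same ops through a YAML config instead).

```python
# Minimal sketch: chain a few text-cleaning mappers, one dataset.map() call per op,
# exactly as the individual unit tests above do.
from data_juicer.core.data import NestedDataset as Dataset
from data_juicer.ops.mapper.fix_unicode_mapper import FixUnicodeMapper
from data_juicer.ops.mapper.punctuation_normalization_mapper import \
    PunctuationNormalizationMapper
from data_juicer.ops.mapper.remove_specific_chars_mapper import \
    RemoveSpecificCharsMapper

samples = [{'text': '►The Mona Lisa doesn’t have eyebrows…'}]
dataset = Dataset.from_list(samples)

# Each op rewrites the 'text' field in place.
for op in (FixUnicodeMapper(),
           PunctuationNormalizationMapper(),
           RemoveSpecificCharsMapper()):
    dataset = dataset.map(op.process, batch_size=2)

for row in dataset:
    print(row['text'])
```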
/tests/ops/mapper/test_sentence_augmentation_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from copy import deepcopy 3 | from data_juicer.ops.mapper.sentence_augmentation_mapper import SentenceAugmentationMapper 4 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 5 | 6 | 7 | class SentenceAugmentationMapperTest(DataJuicerTestCaseBase): 8 | 9 | hf_model = 'Qwen/Qwen2-7B-Instruct' 10 | 11 | text_key = "caption1" 12 | text_key_second = "caption2" 13 | 14 | @classmethod 15 | def tearDownClass(cls) -> None: 16 | super().tearDownClass(cls.hf_model) 17 | 18 | def _run_sentence_augmentation_mapper(self): 19 | op = SentenceAugmentationMapper( 20 | hf_model=self.hf_model, 21 | task_sentence="Please replace one entity in this sentence with " 22 | "another entity, such as an animal, a vehicle, or a " 23 | "piece of furniture. Please only answer with the " 24 | "replaced sentence.", 25 | max_new_tokens=512, 26 | temperature=0.9, 27 | top_p=0.95, 28 | num_beams=1, 29 | text_key=self.text_key, 30 | text_key_second=self.text_key_second 31 | ) 32 | 33 | samples = [ 34 | {self.text_key: 'a book is near a cat and a dog'} 35 | ] 36 | 37 | for sample in samples: 38 | result = op.process(deepcopy(sample)) 39 | print(f'Output results: {result}') 40 | self.assertNotEqual(sample, result) 41 | 42 | def test_sentence_augmentation_mapper(self): 43 | self._run_sentence_augmentation_mapper() 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /tests/ops/mapper/test_whitespace_normalization_mapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.core.data import NestedDataset as Dataset 4 | from data_juicer.ops.mapper.whitespace_normalization_mapper import \ 5 | WhitespaceNormalizationMapper 6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 7 | 8 | 9 | class WhitespaceNormalizationMapperTest(DataJuicerTestCaseBase): 10 | 11 | def setUp(self): 12 | self.op = WhitespaceNormalizationMapper() 13 | 14 | def _run_whitespace_normalization(self, samples): 15 | dataset = Dataset.from_list(samples) 16 | dataset = dataset.map(self.op.process, batch_size=2) 17 | 18 | for data in dataset: 19 | self.assertEqual(data['text'], data['target']) 20 | 21 | def test_case(self): 22 | 23 | samples = [{ 24 | 'text': 'x \t               \u200B\u200C\u200D\u2060\u0084y', 25 | 'target': 'x y' 26 | }] 27 | 28 | self._run_whitespace_normalization(samples) 29 | 30 | 31 | if __name__ == '__main__': 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /tests/ops/selector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/ops/selector/__init__.py -------------------------------------------------------------------------------- /tests/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/tools/__init__.py -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_availablility_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.utils.availability_utils import _is_package_available 4 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 5 | 6 | class AvailabilityUtilsTest(DataJuicerTestCaseBase): 7 | 8 | def test_is_package_available(self): 9 | exist = _is_package_available('fsspec') 10 | self.assertTrue(exist) 11 | exist, version = _is_package_available('fsspec', return_version=True) 12 | self.assertTrue(exist) 13 | self.assertEqual(version, '2023.5.0') 14 | 15 | exist = _is_package_available('non_existing_package') 16 | self.assertFalse(exist) 17 | exist, version = _is_package_available('non_existing_package', return_version=True) 18 | self.assertFalse(exist) 19 | self.assertEqual(version, 'N/A') 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /tests/utils/test_cache_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import datasets 4 | 5 | from data_juicer.utils.cache_utils import DatasetCacheControl, dataset_cache_control 6 | 7 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 8 | 9 | class DatasetCacheControlTest(DataJuicerTestCaseBase): 10 | 11 | def test_basic_func(self): 12 | self.assertTrue(datasets.is_caching_enabled()) 13 | with DatasetCacheControl(on=False): 14 | self.assertFalse(datasets.is_caching_enabled()) 15 | self.assertTrue(datasets.is_caching_enabled()) 16 | 17 | with DatasetCacheControl(on=False): 18 | self.assertFalse(datasets.is_caching_enabled()) 19 | with DatasetCacheControl(on=True): 20 | self.assertTrue(datasets.is_caching_enabled()) 21 | self.assertFalse(datasets.is_caching_enabled()) 22 | self.assertTrue(datasets.is_caching_enabled()) 23 | 24 | def test_decorator(self): 25 | 26 | @dataset_cache_control(on=False) 27 | def check(): 28 | return datasets.is_caching_enabled() 29 | 30 | self.assertTrue(datasets.is_caching_enabled()) 31 | self.assertFalse(check()) 32 | self.assertTrue(datasets.is_caching_enabled()) 33 | 34 | 35 | if __name__ == '__main__': 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /tests/utils/test_fingerprint_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.core import NestedDataset 4 | from data_juicer.utils.fingerprint_utils import generate_fingerprint 5 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 6 | 7 | class FingerprintUtilsTest(DataJuicerTestCaseBase): 8 | 9 | def test_generate_fingerprint(self): 10 | dataset = NestedDataset.from_list([{'text_key': 'test_val'}]) 11 | fingerprint = generate_fingerprint(dataset) 12 | self.assertLessEqual(len(fingerprint), 64) 13 | 14 | # with func args 15 | new_fingerprint = generate_fingerprint(dataset, lambda x: x['text_key']) 16 | self.assertLessEqual(len(new_fingerprint), 64) 17 | self.assertNotEqual(new_fingerprint, fingerprint) 18 | 19 | 20 | if __name__ == '__main__': 21 | unittest.main() 22 | -------------------------------------------------------------------------------- 
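The cache and fingerprint tests above hint at how the two utilities fit together: dataset caching can be switched off for a scoped block, and `generate_fingerprint` returns a short deterministic identifier for a dataset (optionally keyed by a transform function). The combined sketch below assumes only the behavior those tests assert, namely that `DatasetCacheControl` restores the previous caching state on exit and that fingerprints are at most 64 characters; how the fingerprint reacts to a particular map call is not asserted there and is treated as an expectation here.

```python
# Minimal sketch: disable HF datasets caching around one transform and compare
# dataset fingerprints before and after, using the utilities tested above.
import datasets

from data_juicer.core import NestedDataset
from data_juicer.utils.cache_utils import DatasetCacheControl
from data_juicer.utils.fingerprint_utils import generate_fingerprint

dataset = NestedDataset.from_list([{'text': 'Today is Sunday.'}])
print('before:', generate_fingerprint(dataset))

with DatasetCacheControl(on=False):
    assert not datasets.is_caching_enabled()
    dataset = dataset.map(lambda s: {'text': s['text'].lower()})
# caching is re-enabled here, as the context-manager test above verifies

print('after:', generate_fingerprint(dataset))  # expected to differ after the map
```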
/tests/utils/test_process_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import multiprocess as mp 4 | 5 | from data_juicer.utils.process_utils import setup_mp, get_min_cuda_memory, calculate_np 6 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 7 | 8 | class ProcessUtilsTest(DataJuicerTestCaseBase): 9 | 10 | def test_setup_mp(self): 11 | all_methods = mp.get_all_start_methods() 12 | setup_mp() 13 | self.assertIn(mp.get_start_method(), all_methods) 14 | 15 | setup_mp('spawn') 16 | self.assertEqual(mp.get_start_method(), 'spawn') 17 | 18 | setup_mp(['spawn', 'forkserver', 'fork']) 19 | self.assertEqual(mp.get_start_method(), 'spawn') 20 | 21 | def test_get_min_cuda_memory(self): 22 | if torch.cuda.is_available(): 23 | self.assertIsInstance(get_min_cuda_memory(), int) 24 | else: 25 | with self.assertRaises(AssertionError): 26 | get_min_cuda_memory() 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/utils/test_registry.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_juicer.utils.registry import Registry 4 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 5 | 6 | class RegistryTest(DataJuicerTestCaseBase): 7 | 8 | def test_basic_func(self): 9 | registry = Registry('test') 10 | 11 | class A: 12 | pass 13 | registry.register_module('module_a', A) 14 | 15 | @registry.register_module('module_b') 16 | class B: 17 | pass 18 | 19 | self.assertEqual(registry.name, 'test') 20 | self.assertEqual(registry.modules, {'module_a': A, 'module_b': B}) 21 | self.assertEqual(registry.list(), ['module_a', 'module_b']) 22 | self.assertEqual(registry.get('module_a'), A) 23 | self.assertEqual(registry.get('module_b'), B) 24 | 25 | with self.assertRaises(KeyError): 26 | registry.register_module('module_b', B) 27 | 28 | with self.assertRaises(TypeError): 29 | registry.register_module(1, A) 30 | 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /tests/utils/test_resource_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from data_juicer.utils.lazy_loader import LazyLoader 3 | from data_juicer.utils.resource_utils import query_cuda_info, query_mem_info, get_cpu_count, get_cpu_utilization 4 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 5 | 6 | 7 | class RegistryTest(DataJuicerTestCaseBase): 8 | 9 | def test_query_cuda_info(self): 10 | torch = LazyLoader('torch') 11 | if torch.cuda.is_available(): 12 | self.assertIsNotNone(query_cuda_info('memory.used')) 13 | else: 14 | self.assertIsNone(query_cuda_info('memory.used')) 15 | 16 | def test_query_mem_info(self): 17 | self.assertIsInstance(query_mem_info('total'), float) 18 | self.assertIsNone(query_mem_info('invalid key')) 19 | 20 | def test_get_cpu_count(self): 21 | self.assertIsInstance(get_cpu_count(), int) 22 | 23 | def test_get_cpu_utilization(self): 24 | self.assertIsInstance(get_cpu_utilization(), float) 25 | 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /tests/utils/test_unittest_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 
| 3 | from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase 4 | 5 | class UnittestUtilsTest(DataJuicerTestCaseBase): 6 | 7 | def test_placeholder(self): 8 | # placeholder for test 9 | pass 10 | 11 | 12 | if __name__ == '__main__': 13 | unittest.main() 14 | -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/README.md: -------------------------------------------------------------------------------- 1 | # LLM Ecosystems 2 | 3 | Dependencies of the Auto Evaluation Toolkit; see [`tools/evaluator/README.md`](../tools/evaluator/README.md) for more details. 4 | 5 | ## Installation 6 | 7 | The auto-evaluation toolkit requires customized [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) and [HELM](https://github.com/stanford-crfm/helm). 8 | To avoid dependency problems when installing those packages, we recommend using NGC's PyTorch container (`nvcr.io/nvidia/pytorch:22.12-py3`). 9 | Assuming the path to your shared file system (where your data and model checkpoints are saved) is `/mnt/shared`, start the docker container with the following commands. 10 | 11 | ```shell 12 | docker pull nvcr.io/nvidia/pytorch:22.12-py3 13 | docker run --gpus all -it --rm -v /mnt/shared:/workspace nvcr.io/nvidia/pytorch:22.12-py3 14 | ``` 15 | 16 | After starting the docker container, run the following scripts in the container to install Megatron-LM or HELM. 17 | 18 | The training machines only need to install Megatron-LM: 19 | 20 | ```shell 21 | ./setup_megatron.sh 22 | ``` 23 | 24 | The evaluation machine needs to install both Megatron-LM and HELM: 25 | 26 | ```shell 27 | ./setup_megatron.sh 28 | ./setup_helm.sh 29 | ``` 30 | 31 | The toolkit uses [W&B](https://wandb.ai/) (wandb) to monitor the trend of metrics during training. The above steps have already installed wandb, so you only need to run `wandb login` and enter your wandb API key. If you have your own instance of wandb, run the following command with your own host URL instead.
32 | 33 | ```shell 34 | wandb login --host 35 | # enter your api key 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/README_ZH.md: -------------------------------------------------------------------------------- 1 | # 大语言模型生态 2 | 3 | 本目录包含了 Auto Evaluation Toolkit 的第三方依赖项,更多细节请参考 `tools/evaluator/README_ZH.md`。 4 | 5 | ## 安装 6 | 7 | Auto Evaluation Toolkit 依赖于定制化的 [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) 和 [HELM](https://github.com/stanford-crfm/helm)。 8 | 为了避免安装这些软件包时可能出现的依赖项问题,我们建议使用 NGC 的 PyTorch 容器(`nvcr.io/nvidia/pytorch:22.12-py3`)。 9 | 假设您共享文件系统的路径(即数据集和模型检查点的存储路径)为`/mnt/shared`,请使用如下指令启动 Docker 容器。 10 | 11 | ```shell 12 | docker pull nvcr.io/nvidia/pytorch:22.12-py3 13 | docker run --gpus all -it --rm -v /mnt/shared:/workspace nvcr.io/nvidia/pytorch:22.12-py3 14 | ``` 15 | 16 | 启动 Docker 容器后,在容器中运行以下脚本以安装 Megatron-LM 或 HELM。 17 | 18 | 训练机只需要安装 Megatron-LM: 19 | 20 | ```shell 21 | ./setup_megatron.sh 22 | ``` 23 | 24 | 评测机需要同时安装 Megatron-LM 和 HELM: 25 | 26 | ```shell 27 | ./setup_megatron.sh 28 | ./setup_helm.sh 29 | ``` 30 | 31 | 工具包使用 [WandB](https://wandb.ai/) 来监视训练期间各指标的趋势。上面的步骤中已安装 wandb,您只需要运行 `wandb login` 并输入 wandb API 密钥即可。如果您有自己的 wandb 实例,请运行以下脚本。 32 | 33 | ```shell 34 | wandb login --host 35 | #输入您的 API 密钥 36 | ``` 37 | -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/setup_helm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export THIRD_PARTY_DIR=$(cd $(dirname $0); pwd) 4 | export HELM_DIR=${THIRD_PARTY_DIR}/helm 5 | 6 | # install conda 7 | conda &> /dev/null 8 | if [ $? -ne 0 ]; then 9 | echo "> setup conda ..." 10 | CONDA_DIR=${HOME}/miniconda3 11 | wget https://repo.anaconda.com/miniconda/Miniconda3-py38_23.1.0-1-Linux-x86_64.sh 12 | bash Miniconda3-py38_23.1.0-1-Linux-x86_64.sh -b -p $CONDA_DIR 13 | export PATH=$CONDA_DIR/bin:$PATH 14 | fi 15 | 16 | # setup helm 17 | echo "> setup helm ..." 18 | git clone https://github.com/stanford-crfm/helm.git 19 | cd $HELM_DIR 20 | git reset 33ca6e62 --hard 21 | git apply ${THIRD_PARTY_DIR}/patch/helm.diff 22 | conda create -n crfm-helm python=3.8 pip -y 23 | eval "$(conda shell.bash hook)" 24 | conda activate crfm-helm 25 | pip install -e . -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/setup_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export THIRD_PARTY_DIR=$(cd $(dirname $0); pwd) 4 | export MEGATRON_DIR=${THIRD_PARTY_DIR}/Megatron-LM 5 | 6 | 7 | # setup megatron 8 | echo "> setup Megatron-LM ..."
9 | git clone https://github.com/NVIDIA/Megatron-LM.git
10 | cd $MEGATRON_DIR
11 | git reset 040eac9 --hard
12 | git apply ${THIRD_PARTY_DIR}/patch/megatron.diff
13 | pip install flash-attn flask flask_restful jsonlines asyncio wandb sentencepiece
14 |
--------------------------------------------------------------------------------
/thirdparty/models/README.md:
--------------------------------------------------------------------------------
1 | # Third-party Model Library
2 |
3 | ## EasyAnimate
4 |
5 | Install [EasyAnimate](https://github.com/aigc-apps/EasyAnimate):
6 |
7 | ```shell
8 | bash ./setup_easyanimate.sh
9 | ```
10 |
--------------------------------------------------------------------------------
/thirdparty/models/README_ZH.md:
--------------------------------------------------------------------------------
1 | # Third-party Model Library
2 |
3 | ## EasyAnimate
4 |
5 | Install [EasyAnimate](https://github.com/aigc-apps/EasyAnimate):
6 |
7 | ```shell
8 | bash ./setup_easyanimate.sh
9 | ```
10 |
--------------------------------------------------------------------------------
/thirdparty/models/setup_easyanimate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export THIRD_PARTY_DIR=$(cd $(dirname $0); pwd)
4 | export EASYANIMATE_DIR=${THIRD_PARTY_DIR}/EasyAnimate
5 |
6 | # setup easyanimate
7 | echo "> setup easyanimate ..."
8 | git clone https://github.com/aigc-apps/EasyAnimate.git
9 | cd $EASYANIMATE_DIR
10 | git reset b54412ceb0af6a06bf907e049920f18508c862f1 --hard
11 | git apply ${THIRD_PARTY_DIR}/patch/easyanimate.diff
12 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/__init__.py
--------------------------------------------------------------------------------
/tools/analyze_data.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 |
3 | from data_juicer.core import Analyzer
4 |
5 |
6 | @logger.catch(reraise=True)
7 | def main():
8 |     analyzer = Analyzer()
9 |     analyzer.run()
10 |
11 |
12 | if __name__ == '__main__':
13 |     main()
14 |
--------------------------------------------------------------------------------
/tools/converter/batch_convert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | WORKDIR='.'
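# (added note) Fill MODEL_TO_CONVERT below with the Megatron checkpoint directories
# to convert, and PATH_TO_SAVE with the matching HuggingFace output directories;
# the two arrays are consumed pairwise by the loop that follows.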
6 |
7 | MODEL_TO_CONVERT=(
8 | )
9 |
10 | PATH_TO_SAVE=(
11 | )
12 |
13 | for i in "${!MODEL_TO_CONVERT[@]}"; do
14 |     path_model=${MODEL_TO_CONVERT[i]}
15 |     path_save=${PATH_TO_SAVE[i]}
16 |
17 |     echo $i ":" $path_model "to" $path_save
18 |
19 |     python ${WORKDIR}/convert/convert_gpt_to_transformers.py \
20 |         --load_path ${path_model} \
21 |         --save_path ${path_save} \
22 |         --max_shard_size "10GB" \
23 |         --tokenizer_name "decapoda-research/llama-7b-hf" \
24 |         --print-checkpoint-structure
25 | done
26 |
--------------------------------------------------------------------------------
/tools/distributed_deduplication/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/distributed_deduplication/__init__.py
--------------------------------------------------------------------------------
/tools/evaluator/config/evaluator_example.yaml:
--------------------------------------------------------------------------------
1 | auto_eval:
2 |   project_name:
3 |   model_name:
4 |   cache_dir:
5 |   megatron:
6 |     process_num:
7 |     megatron_home:
8 |     checkpoint_path:
9 |     tokenizer_type:
10 |     vocab_path:
11 |     merge_path:
12 |     max_tokens:
13 |     token_per_iteration:
14 |     # tokenizer_path:
15 |     # log_path:
16 |   helm:
17 |     helm_spec_template_path:
18 |     helm_output_path:
19 |     helm_env_name:
20 |   gpt_evaluation:
21 |     # openai config
22 |     openai_api_key:
23 |     openai_organization:
24 |     # files config
25 |     question_file: ./tools/eval/gpt_eval/config/question.jsonl
26 |     answer_file:
27 |     baseline_file: ./tools/eval/gpt_eval/answer/openai/chatgpt.jsonl
28 |     prompt_file: ./tools/eval/gpt_eval/config/prompt.jsonl
29 |     reviewer_file: ./tools/eval/gpt_eval/config/reviewer.jsonl
30 |     result_file:
31 |   wandb:
32 |     project:
33 |     base_url:
34 |
--------------------------------------------------------------------------------
/tools/evaluator/gpt_eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/evaluator/gpt_eval/__init__.py
--------------------------------------------------------------------------------
/tools/evaluator/gpt_eval/config/config.yaml:
--------------------------------------------------------------------------------
1 | answer_generation:
2 |   model_name: my_model
3 |   question_file: ./config/question.jsonl
4 |   answer_file: ./answer/myorg/mymodel.jsonl
5 |   batch_size: 4
6 |   max_tokens: 512
7 |   temperature: 0.7
8 |   # config for huggingface
9 |   huggingface:
10 |     model_path: myorg/mymodel
11 |     tokenizer_path: myorg/mymodel
12 |   # # config for openai
13 |   # openai:
14 |   #   openai_organization:
15 |   #   openai_api_key:
16 |   #   model:
17 |   #   max_retry:
18 |   # # config for megatron-lm
19 |   # megatron:
20 |   #   process_num:
21 |   #   checkpoint_path:
22 |   #   tokenizer_type:
23 |   #   vocab_path:
24 |   #   merge_path:
25 |   #   iteration:
26 | gpt_evaluation:
27 |   # openai config
28 |   openai_organization:
29 |   openai_api_key:
30 |   # files config
31 |   question_file: ./config/question.jsonl
32 |   answer_file: ./answer/myorg/mymodel.jsonl
33 |   baseline_file: ./answer/openai/gpt-3.5-turbo.jsonl
34 |   prompt_file: ./config/prompt.jsonl
35 |   reviewer_file: ./config/reviewer.jsonl
36 |   result_file: ./review/myorg/mymodel-gpt3.5-turbo.jsonl
37 |
--------------------------------------------------------------------------------
/tools/evaluator/gpt_eval/config/reviewer.jsonl:
--------------------------------------------------------------------------------
1 | {"category": "general", "metadata": {"temperature": 0.2, "max_tokens": 1024, "model": "gpt-3.5-turbo"}}
2 | {"category": "coding", "metadata": {"temperature": 0.2, "max_tokens": 1024, "model": "gpt-3.5-turbo"}}
3 | {"category": "math", "metadata": {"temperature": 0.2, "max_tokens": 1024, "model": "gpt-3.5-turbo"}}
4 |
--------------------------------------------------------------------------------
/tools/evaluator/recorder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/evaluator/recorder/__init__.py
--------------------------------------------------------------------------------
/tools/evaluator/recorder/config/leaderboard_example.yaml:
--------------------------------------------------------------------------------
1 | project:
2 | base_url:
3 | leaderboard: True
4 | leaderboard_metrics:
5 |   - mmlu.EM
6 |   - boolq.EM
7 |   - quac.F1
8 |   - raft.EM
9 |   - hellaswag.EM
10 |   - ...
11 | excluded_runs:
12 |   -
13 |   - ...
14 |
--------------------------------------------------------------------------------
/tools/evaluator/recorder/config/llama_example.yaml:
--------------------------------------------------------------------------------
1 | project:
2 | base_url:
3 | evals:
4 |   - eval_type: helm
5 |     model_name: llama-7b
6 |     source: file
7 |     token_num: 1000
8 |     eval_result:
9 |       mmlu:
10 |         EM: 0.345
11 |       raft:
12 |         EM: 0.583
13 |       imdb:
14 |         EM: 0.933
15 |       truthful_qa:
16 |         EM: 0.297
17 |       summarization_cnndm:
18 |         ROUGE-2: 0.149
19 |       summarization_xsum:
20 |         ROUGE-2: 0.127
21 |       boolq:
22 |         EM: 0.751
23 |       msmarco_trec:
24 |         NDCG@10: 0.482
25 |       msmarco_regular:
26 |         RR@10: 0.252
27 |       narrative_qa:
28 |         F1: 0.524
29 |       natural_qa_closedbook:
30 |         F1: 0.297
31 |       natural_qa_openbook_longans:
32 |         F1: 0.580
33 |       quac:
34 |         F1: 0.332
35 |       civil_comments:
36 |         EM: 0.578
37 |       hellaswag:
38 |         EM: 0.747
39 |       openbookqa:
40 |         EM: 0.574
41 |
--------------------------------------------------------------------------------
/tools/evaluator/recorder/config/mymodel_example.yaml:
--------------------------------------------------------------------------------
1 | project:
2 | base_url:
3 | evals:
4 |   - eval_type: helm
5 |     model_name:
6 |     source: helm
7 |     helm_output_dir:
8 |     helm_suite_name:
9 |     token_per_iteration:
10 |     benchmarks:
11 |       - name: mmlu
12 |         metrics:
13 |           - EM
14 |       - name: boolq
15 |         metrics:
16 |           - EM
17 |       - name: quac
18 |         metrics:
19 |           - F1
20 |       - name: raft
21 |         metrics:
22 |           - EM
23 |       - name: hellaswag
24 |         metrics:
25 |           - EM
26 |       - ...
27 |
--------------------------------------------------------------------------------
/tools/fmt_conversion/README_ZH.md:
--------------------------------------------------------------------------------
1 | # Format Conversion Tools
2 |
3 | Here, Data-Juicer provides a dozen or so format conversion tools for a wide variety of datasets, including multimodal datasets, post-tuning datasets, and more.
4 | These tools convert datasets from their raw formats into the unified intermediate format that Data-Juicer expects (which we call the "DJ format"). The default implementations of DJ operators are designed around this format, e.g., they read the data payload directly from the 'text' field and apply the corresponding processing to it. For special format requirements, users can either adapt via operator parameter configuration or extend support through custom operator implementations.
5 |
6 | An example of the DJ format is shown below:
7 |
8 | ```python
9 | {
10 |     // >>> core content: text, dialogs, ......
11 |     "text": "xxx",
12 |     "query": "xxx",
13 |     "response": "xxx",
14 |     ......
15 |     // <<< core content
16 |
17 |     // >>> extra data content: paths to multimodal data, ......
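    // (added note, illustrative) the lists below hold file paths only; in DJ's
    // multimodal recipes these paths are usually referenced from the "text" field
    // via special placeholder tokens, so the order of each list matters.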
18 |     "images": [
19 |         "path/to/the/image/of/antarctica_snowfield",
20 |         "path/to/the/image/of/antarctica_map",
21 |         "path/to/the/image/of/europe_map"
22 |     ],
23 |     "audios": [
24 |         "path/to/the/audio/of/sound_of_waves_in_Antarctic_Ocean"
25 |     ],
26 |     "videos": [
27 |         "path/to/the/video/of/remote_sensing_view_of_antarctica"
28 |     ],
29 |     // <<< extra data content
30 |
31 |     // >>> meta info and stats; they may come with the original dataset or be produced by Data-Juicer
32 |     "meta": {
33 |         "src": "customized",
34 |         "version": "0.1",
35 |         "author": "xxx"
36 |     },
37 |     "stats": {
38 |         "lang": "en",
39 |         "image_widths": [224, 336, 512],
40 |         ...
41 |     },
42 |     // <<< meta info and stats
43 | }
44 | ```
45 |
46 | The DJ format roughly consists of three parts:
47 | 1. Core content: e.g., the text in LLM pre-training datasets, or the dialogs in post-tuning datasets. It is directly related to the downstream training or fine-tuning of the dataset.
48 | 2. Extra data content: e.g., the paths to multimodal data in multimodal datasets. They are organized as lists of paths.
49 | 3. Meta info and stats: e.g., the dataset version or source information inherited from the original dataset, or the category tags and stats produced by Data-Juicer's operators.
50 |
51 | Parts 2 and 3 are common to different datasets and are organized in almost the same structure.
52 | In contrast, part 1, the core content, can differ a lot across datasets.
53 | The documents below describe this part in more detail for different kinds of datasets:
54 | - [Multimodal datasets](multimodal/README_ZH.md)
55 | - [Post-tuning datasets](post_tuning_dialog/README_ZH.md)
--------------------------------------------------------------------------------
/tools/hpo/configs/process.yaml:
--------------------------------------------------------------------------------
1 | # Process config example for dataset
2 |
3 | # global parameters
4 | project_name: 'demo-process-hpo'
5 | dataset_path: 'demo-redpajama-c4-refined.jsonl'
6 | np: 4 # number of subprocesses to process your dataset
7 |
8 | export_path: './outputs/demo-hpo-process/demo-hpo-processed.jsonl'
9 |
10 | # process schedule
11 | # a list of several process operators with their arguments
12 | process:
13 |   - character_repetition_filter: # filter text with the character repetition ratio out of specific range
14 |       rep_len: 10 # repetition length for char-level n-gram
15 |       min_ratio: 0.0 # the min ratio of filter range
16 |       max_ratio: 0.5
17 |   - text_length_filter: # filter text with length out of specific range
18 |       min_len: 10 # the min length of filter range
19 |       max_len: 10000
20 |
--------------------------------------------------------------------------------
/tools/hpo/configs/quality_score_hpo.yaml:
--------------------------------------------------------------------------------
1 |
2 | sweep_name: hpo_for_data-juicer
3 | sweep_max_count: 1000 # the maximal number of trials; `None` for unlimited
4 |
5 | # hpo configuration from original sweep, see more options and details in
6 | # https://docs.wandb.ai/guides/sweeps/define-sweep-configuration
7 |
8 | method: bayes # ["random", "grid", "bayes"]
9 |
10 | metric:
11 |   name: quality_score # defined in hpo/objects.py
12 |   goal: maximize # ["maximize", "minimize", +"target"]
13 |
14 | parameters:
15 |   # can be [single value, multiple values, probabilities, distribution, nested]
16 |   character_repetition_filter.rep_len:
17 |     values: [2, 4, 8, 16]
18 |   character_repetition_filter.max_ratio:
19 |     values: [0.3, 0.5, 0.7]
20 |   text_length_filter.min_len:
21 |     distribution: q_log_uniform_values
22 |     min: 256
23 |     max: 8192
24 |
25 | early_terminate:
26 |   type: hyperband
27 |   max_iter: 27
28 |   s: 2
29 |
--------------------------------------------------------------------------------
/tools/hpo/execute_hpo_wandb.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import wandb
4 | import yaml
5 | from jsonargparse import namespace_to_dict
6 | from objects import get_hpo_objective
7 |
8 | from data_juicer.config import init_configs, merge_config
9 |
10 | # 1: load the defined search space
11 | sweep_cfg_file_path = None
12 | for i in range(len(sys.argv) - 1):
13 |     if sys.argv[i] == '--hpo_config':
14 |         sweep_cfg_file_path = sys.argv[i + 1]
15 |         break
16 | if not sweep_cfg_file_path:
17 |     raise ValueError('Not found --hpo_config, you should specify your '
18 |                      'hpo cfg file path following `--hpo_config`')
19 | with open(sweep_cfg_file_path) as f:
20 |     sweep_configuration = yaml.safe_load(f)
21 |
22 |
23 | def search():
24 |     wandb.init(project=sweep_configuration['sweep_name'])
25 |
26 |     # 2.1: Choose objective that links the hyper-parameters you want to search
27 |     object_func = get_hpo_objective(sweep_configuration['metric']['name'])
28 |
29 |     dj_cfg = init_configs()
30 |     # merge the new hyper-parameters selected by HPO scheduler
31 |     dj_cfg = merge_config(dj_cfg, wandb.config)
32 |     wandb.config = namespace_to_dict(dj_cfg)  # for configuration track
33 |
34 |     # 2.2: calculate objective using new hyper-parameters, track the results
35 |     score = float(object_func(dj_cfg))
36 |     wandb.log({sweep_configuration['metric']['name']: score})
37 |
38 |
39 | # 3: Start the sweep, iteratively search hyper-parameters
40 | sweep_id = wandb.sweep(sweep=sweep_configuration,
41 |                        project=sweep_configuration['sweep_name'])
42 |
43 | wandb.agent(sweep_id,
44 |             function=search,
45 |             count=sweep_configuration['sweep_max_count']
46 |             if 'sweep_max_count' in sweep_configuration else None)
47 |
--------------------------------------------------------------------------------
/tools/humanops/enable_legacy_token.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/humanops/enable_legacy_token.png
--------------------------------------------------------------------------------
/tools/mm_eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/mm_eval/__init__.py
--------------------------------------------------------------------------------
/tools/mm_eval/vbench_metrics/README.md:
--------------------------------------------------------------------------------
1 | VBench is from the paper "VBench: Comprehensive Benchmark Suite for Video Generative Models".
2 |
3 | Please refer to [GitHub](https://github.com/Vchitect/VBench) for more details.
4 |
--------------------------------------------------------------------------------
/tools/mm_eval/vbench_metrics/README_ZH.md:
--------------------------------------------------------------------------------
1 | VBench is from the paper "VBench: Comprehensive Benchmark Suite for Video Generative Models".
2 |
3 | Please refer to [GitHub](https://github.com/Vchitect/VBench) for more information.
4 |
--------------------------------------------------------------------------------
/tools/multimodal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/multimodal/__init__.py
--------------------------------------------------------------------------------
/tools/process_data.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 |
3 | from data_juicer.config import init_configs
4 | from data_juicer.core import DefaultExecutor
5 |
6 |
7 | @logger.catch(reraise=True)
8 | def main():
9 |     cfg = init_configs()
10 |     if cfg.executor_type == 'default':
11 |         executor = DefaultExecutor(cfg)
12 |     elif cfg.executor_type == 'ray':
13 |         from data_juicer.core.executor.ray_executor import RayExecutor
14 |         executor = RayExecutor(cfg)
15 |     executor.run()
16 |
17 |
18 | if __name__ == '__main__':
19 |     main()
20 |
--------------------------------------------------------------------------------
/tools/quality_classifier/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/data-juicer/076d6e4ea192980a1691682a81e21de689a9cfbe/tools/quality_classifier/__init__.py
--------------------------------------------------------------------------------
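To close this section, here is a minimal usage sketch for the entry-point scripts shown above (`tools/process_data.py`, `tools/analyze_data.py`, and `tools/hpo/execute_hpo_wandb.py`), each of which is driven by a Data-Juicer recipe file passed on the command line. The invocations below are illustrative assumptions based on the code in this section and the demo recipes under `configs/demo/`; adjust the paths to your own configs.

```shell
# process a dataset with the operators listed in a recipe (illustrative paths)
python tools/process_data.py --config configs/demo/process.yaml

# analyze a dataset with the same kind of recipe to collect per-op stats
python tools/analyze_data.py --config configs/demo/analyzer.yaml

# sweep recipe hyper-parameters with W&B; --hpo_config is read explicitly by
# execute_hpo_wandb.py (see the sys.argv scan in that script above)
python tools/hpo/execute_hpo_wandb.py \
    --config tools/hpo/configs/process.yaml \
    --hpo_config tools/hpo/configs/quality_score_hpo.yaml
```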