├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── custom.md │ ├── feature_request.yml │ └── question.yml └── workflows │ ├── deploy_sphinx_docs.yml │ ├── docker │ └── docker-compose.yml │ ├── perf-bench.yml │ ├── pre-commit.yml │ ├── publish-docker-oss.yml │ ├── publish-docker.yml │ ├── publish-pypi.yml │ ├── stale.yml │ ├── unit-test-partial.yml │ └── unit-test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pre-commit-hooks ├── build_op_doc.py └── tag_mappings.json ├── .secrets.baseline ├── Dockerfile ├── LICENSE ├── README.md ├── README_ZH.md ├── app.py ├── data_juicer ├── __init__.py ├── analysis │ ├── __init__.py │ ├── collector.py │ ├── column_wise_analysis.py │ ├── correlation_analysis.py │ ├── diversity_analysis.py │ ├── measure.py │ └── overall_analysis.py ├── config │ ├── __init__.py │ ├── config.py │ ├── config_all.yaml │ └── config_min.yaml ├── core │ ├── __init__.py │ ├── adapter.py │ ├── analyzer.py │ ├── data │ │ ├── __init__.py │ │ ├── config_validator.py │ │ ├── data_validator.py │ │ ├── dataset_builder.py │ │ ├── dj_dataset.py │ │ ├── load_strategy.py │ │ ├── ray_dataset.py │ │ └── schema.py │ ├── executor │ │ ├── __init__.py │ │ ├── base.py │ │ ├── default_executor.py │ │ ├── factory.py │ │ └── ray_executor.py │ ├── exporter.py │ ├── monitor.py │ ├── ray_exporter.py │ └── tracer.py ├── download │ ├── __init__.py │ ├── arxiv.py │ ├── commoncrawl.py │ ├── downloader.py │ └── wikipedia.py ├── format │ ├── __init__.py │ ├── csv_formatter.py │ ├── empty_formatter.py │ ├── formatter.py │ ├── json_formatter.py │ ├── load.py │ ├── parquet_formatter.py │ ├── text_formatter.py │ └── tsv_formatter.py ├── ops │ ├── __init__.py │ ├── aggregator │ │ ├── __init__.py │ │ ├── entity_attribute_aggregator.py │ │ ├── meta_tags_aggregator.py │ │ ├── most_relevant_entities_aggregator.py │ │ └── nested_aggregator.py │ ├── base_op.py │ ├── common │ │ ├── __init__.py │ │ ├── dwpose_func.py │ │ ├── helper_func.py │ │ ├── prompt2prompt_pipeline.py │ │ └── special_characters.py │ ├── deduplicator │ │ ├── __init__.py │ │ ├── document_deduplicator.py │ │ ├── document_minhash_deduplicator.py │ │ ├── document_simhash_deduplicator.py │ │ ├── image_deduplicator.py │ │ ├── ray_basic_deduplicator.py │ │ ├── ray_bts_minhash_deduplicator.py │ │ ├── ray_document_deduplicator.py │ │ ├── ray_image_deduplicator.py │ │ ├── ray_video_deduplicator.py │ │ └── video_deduplicator.py │ ├── filter │ │ ├── __init__.py │ │ ├── alphanumeric_filter.py │ │ ├── audio_duration_filter.py │ │ ├── audio_nmf_snr_filter.py │ │ ├── audio_size_filter.py │ │ ├── average_line_length_filter.py │ │ ├── character_repetition_filter.py │ │ ├── flagged_words_filter.py │ │ ├── general_field_filter.py │ │ ├── image_aesthetics_filter.py │ │ ├── image_aspect_ratio_filter.py │ │ ├── image_face_count_filter.py │ │ ├── image_face_ratio_filter.py │ │ ├── image_nsfw_filter.py │ │ ├── image_pair_similarity_filter.py │ │ ├── image_shape_filter.py │ │ ├── image_size_filter.py │ │ ├── image_text_matching_filter.py │ │ ├── image_text_similarity_filter.py │ │ ├── image_watermark_filter.py │ │ ├── in_context_influence_filter.py │ │ ├── instruction_following_difficulty_filter.py │ │ ├── language_id_score_filter.py │ │ ├── llm_analysis_filter.py │ │ ├── llm_difficulty_score_filter.py │ │ ├── llm_perplexity_filter.py │ │ ├── llm_quality_score_filter.py │ │ ├── llm_task_relevance_filter.py │ │ ├── maximum_line_length_filter.py │ │ ├── perplexity_filter.py │ │ ├── phrase_grounding_recall_filter.py │ │ ├── special_characters_filter.py │ │ ├── specified_field_filter.py │ │ ├── specified_numeric_field_filter.py │ │ ├── stopwords_filter.py │ │ ├── suffix_filter.py │ │ ├── text_action_filter.py │ │ ├── text_embd_similarity_filter.py │ │ ├── text_entity_dependency_filter.py │ │ ├── text_length_filter.py │ │ ├── text_pair_similarity_filter.py │ │ ├── token_num_filter.py │ │ ├── video_aesthetics_filter.py │ │ ├── video_aspect_ratio_filter.py │ │ ├── video_duration_filter.py │ │ ├── video_frames_text_similarity_filter.py │ │ ├── video_motion_score_filter.py │ │ ├── video_motion_score_raft_filter.py │ │ ├── video_nsfw_filter.py │ │ ├── video_ocr_area_ratio_filter.py │ │ ├── video_resolution_filter.py │ │ ├── video_tagging_from_frames_filter.py │ │ ├── video_watermark_filter.py │ │ ├── word_repetition_filter.py │ │ └── words_num_filter.py │ ├── grouper │ │ ├── __init__.py │ │ ├── key_value_grouper.py │ │ ├── naive_grouper.py │ │ └── naive_reverse_grouper.py │ ├── load.py │ ├── mapper │ │ ├── __init__.py │ │ ├── annotation │ │ │ ├── __init__.py │ │ │ ├── annotation_mapper.py │ │ │ └── human_preference_annotation_mapper.py │ │ ├── audio_add_gaussian_noise_mapper.py │ │ ├── audio_ffmpeg_wrapped_mapper.py │ │ ├── calibrate_qa_mapper.py │ │ ├── calibrate_query_mapper.py │ │ ├── calibrate_response_mapper.py │ │ ├── chinese_convert_mapper.py │ │ ├── clean_copyright_mapper.py │ │ ├── clean_email_mapper.py │ │ ├── clean_html_mapper.py │ │ ├── clean_ip_mapper.py │ │ ├── clean_links_mapper.py │ │ ├── detect_character_attributes_mapper.py │ │ ├── detect_character_locations_mapper.py │ │ ├── detect_main_character_mapper.py │ │ ├── dialog_intent_detection_mapper.py │ │ ├── dialog_sentiment_detection_mapper.py │ │ ├── dialog_sentiment_intensity_mapper.py │ │ ├── dialog_topic_detection_mapper.py │ │ ├── download_file_mapper.py │ │ ├── expand_macro_mapper.py │ │ ├── extract_entity_attribute_mapper.py │ │ ├── extract_entity_relation_mapper.py │ │ ├── extract_event_mapper.py │ │ ├── extract_keyword_mapper.py │ │ ├── extract_nickname_mapper.py │ │ ├── extract_support_text_mapper.py │ │ ├── extract_tables_from_html_mapper.py │ │ ├── fix_unicode_mapper.py │ │ ├── generate_qa_from_examples_mapper.py │ │ ├── generate_qa_from_text_mapper.py │ │ ├── image_blur_mapper.py │ │ ├── image_captioning_from_gpt4v_mapper.py │ │ ├── image_captioning_mapper.py │ │ ├── image_detection_yolo_mapper.py │ │ ├── image_diffusion_mapper.py │ │ ├── image_face_blur_mapper.py │ │ ├── image_remove_background_mapper.py │ │ ├── image_segment_mapper.py │ │ ├── image_tagging_mapper.py │ │ ├── imgdiff_difference_area_generator_mapper.py │ │ ├── imgdiff_difference_caption_generator_mapper.py │ │ ├── mllm_mapper.py │ │ ├── nlpaug_en_mapper.py │ │ ├── nlpcda_zh_mapper.py │ │ ├── optimize_prompt_mapper.py │ │ ├── optimize_qa_mapper.py │ │ ├── optimize_query_mapper.py │ │ ├── optimize_response_mapper.py │ │ ├── pair_preference_mapper.py │ │ ├── punctuation_normalization_mapper.py │ │ ├── python_file_mapper.py │ │ ├── python_lambda_mapper.py │ │ ├── query_intent_detection_mapper.py │ │ ├── query_sentiment_detection_mapper.py │ │ ├── query_topic_detection_mapper.py │ │ ├── relation_identity_mapper.py │ │ ├── remove_bibliography_mapper.py │ │ ├── remove_comments_mapper.py │ │ ├── remove_header_mapper.py │ │ ├── remove_long_words_mapper.py │ │ ├── remove_non_chinese_character_mapper.py │ │ ├── remove_repeat_sentences_mapper.py │ │ ├── remove_specific_chars_mapper.py │ │ ├── remove_table_text_mapper.py │ │ ├── remove_words_with_incorrect_substrings_mapper.py │ │ ├── replace_content_mapper.py │ │ ├── sdxl_prompt2prompt_mapper.py │ │ ├── sentence_augmentation_mapper.py │ │ ├── sentence_split_mapper.py │ │ ├── text_chunk_mapper.py │ │ ├── text_tagging_by_prompt_mapper.py │ │ ├── vggt_mapper.py │ │ ├── video_captioning_from_audio_mapper.py │ │ ├── video_captioning_from_frames_mapper.py │ │ ├── video_captioning_from_summarizer_mapper.py │ │ ├── video_captioning_from_video_mapper.py │ │ ├── video_extract_frames_mapper.py │ │ ├── video_face_blur_mapper.py │ │ ├── video_ffmpeg_wrapped_mapper.py │ │ ├── video_hand_reconstruction_mapper.py │ │ ├── video_remove_watermark_mapper.py │ │ ├── video_resize_aspect_ratio_mapper.py │ │ ├── video_resize_resolution_mapper.py │ │ ├── video_split_by_duration_mapper.py │ │ ├── video_split_by_key_frame_mapper.py │ │ ├── video_split_by_scene_mapper.py │ │ ├── video_tagging_from_audio_mapper.py │ │ ├── video_tagging_from_frames_mapper.py │ │ ├── video_whole_body_pose_estimation_mapper.py │ │ └── whitespace_normalization_mapper.py │ ├── mixins.py │ ├── op_fusion.py │ └── selector │ │ ├── __init__.py │ │ ├── frequency_specified_field_selector.py │ │ ├── random_selector.py │ │ ├── range_specified_field_selector.py │ │ ├── tags_specified_field_selector.py │ │ └── topk_specified_field_selector.py ├── tools │ ├── DJ_mcp_granular_ops.py │ ├── DJ_mcp_recipe_flow.py │ ├── __init__.py │ ├── hpo │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── process.yaml │ │ │ └── quality_score_hpo.yaml │ │ ├── demo-redpajama-c4-refined.jsonl │ │ ├── execute_hpo_3sigma.py │ │ ├── execute_hpo_wandb.py │ │ └── objects.py │ ├── mcp_server.py │ ├── mcp_tool.py │ ├── op_search.py │ └── quality_classifier │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── __init__.py │ │ ├── eval.py │ │ ├── predict.py │ │ ├── qc_utils.py │ │ └── train.py └── utils │ ├── __init__.py │ ├── asset_utils.py │ ├── availability_utils.py │ ├── cache_utils.py │ ├── ckpt_utils.py │ ├── common_utils.py │ ├── compress.py │ ├── constant.py │ ├── file_utils.py │ ├── fingerprint_utils.py │ ├── lazy_loader.py │ ├── logger_utils.py │ ├── mm_utils.py │ ├── model_utils.py │ ├── nltk_utils.py │ ├── process_utils.py │ ├── ray_utils.py │ ├── registry.py │ ├── resource_utils.py │ ├── s3_utils.py │ ├── sample.py │ ├── unittest_utils.py │ ├── video_utils.py │ └── webdataset_utils.py ├── demos ├── README.md ├── README_ZH.md ├── analyze_simple │ └── analyzer.yaml ├── api_service │ ├── configs │ │ ├── dj_config_template.yaml │ │ └── model_configs.json │ ├── react_data_filter_process.ipynb │ ├── react_data_mapper_process.ipynb │ ├── utils.py │ ├── wrapped_filters.py │ └── wrapped_mappers.py ├── auto_evaluation_helm │ ├── README_ZH.md │ ├── app.py │ └── imgs │ │ ├── data-juicer.png │ │ ├── eval-01.png │ │ └── eval-02.png ├── data │ ├── auto-prompt-optim │ │ ├── demo-dataset-prompts.jsonl │ │ └── init_infer_dataset.jsonl │ ├── demo-dataset-annotation-human-preference.jsonl │ ├── demo-dataset-chatml.jsonl │ ├── demo-dataset-content.jsonl │ ├── demo-dataset-deduplication.jsonl │ ├── demo-dataset-images-bytes.parquet │ ├── demo-dataset-images.jsonl │ ├── demo-dataset-videos.jsonl │ ├── demo-dataset.jsonl │ ├── demo-dataset_1725870268.jsonl │ └── demo-dataset_1725870628.jsonl ├── data_mixture │ ├── app.py │ └── data │ │ ├── redpajama-c4-refined.jsonl │ │ ├── redpajama-cc-2023-06-refined.jsonl │ │ ├── redpajama-pile-stackexchange-refined.jsonl │ │ ├── the-pile-nih-refined.jsonl │ │ └── the-pile-uspto-refined.jsonl ├── data_process_hpo │ ├── app.py │ └── imgs │ │ └── data-juicer.png ├── data_process_loop │ ├── app.py │ ├── configs │ │ └── demo.yaml │ └── data │ │ └── demo-dataset.jsonl ├── data_visualization_diversity │ ├── app.py │ ├── configs │ │ └── demo.yaml │ └── data │ │ └── demo-dataset.jsonl ├── data_visualization_op_effect │ ├── app.py │ ├── configs │ │ ├── demo_en.yaml │ │ └── demo_zh.yaml │ └── data │ │ └── demo-dataset.jsonl ├── data_visualization_op_insight │ ├── app.css │ ├── app.py │ └── cache │ │ └── .gitkeep ├── data_visualization_statistics │ ├── app.py │ ├── configs │ │ └── demo.yaml │ └── data │ │ └── demo-dataset.jsonl ├── overview_scan │ ├── app.py │ └── data │ │ └── demo-dataset.jsonl ├── process_cft_zh_data │ ├── app.py │ └── data │ │ └── alpaca-cot.jsonl ├── process_code_data │ ├── app.py │ └── data │ │ └── stack_exchange.jsonl ├── process_on_ray │ ├── configs │ │ ├── dedup.yaml │ │ ├── demo-new-config.yaml │ │ └── demo.yaml │ └── data │ │ ├── demo-dataset.json │ │ └── demo-dataset.jsonl ├── process_sci_data │ ├── app.py │ └── data │ │ └── arxiv.jsonl ├── process_simple │ └── process.yaml ├── process_video_on_ray │ ├── configs │ │ ├── demo-new-config.yaml │ │ └── demo.yaml │ └── data │ │ ├── Note.md │ │ ├── demo-dataset.jsonl │ │ └── videos │ │ ├── video1.mp4 │ │ ├── video2.mp4 │ │ └── video3.mp4 ├── role_playing_system_prompt │ ├── README_ZH.md │ ├── role_playing_system_prompt.yaml │ └── system_prompt_generator.py ├── tool_dataset_splitting_by_language │ ├── app.py │ ├── data │ │ └── demo-dataset.jsonl │ └── dataset_splitting_by_language.py └── tool_quality_classifier │ ├── app.py │ ├── data │ └── demo-dataset.jsonl │ └── quality_classifier │ ├── __init__.py │ ├── eval.py │ ├── predict.py │ ├── qc_utils.py │ └── train.py ├── docs ├── BadDataExhibition.md ├── BadDataExhibition_ZH.md ├── DJ_SORA.md ├── DJ_SORA_ZH.md ├── DJ_agents.md ├── DJ_agents_ZH.md ├── DJ_service.md ├── DJ_service_ZH.md ├── DatasetCfg.md ├── DatasetCfg_ZH.md ├── DeveloperGuide.md ├── DeveloperGuide_ZH.md ├── Distributed.md ├── Distributed_ZH.md ├── Operators.md ├── Sandbox.md ├── Sandbox_ZH.md ├── awesome_llm_data.md ├── hub │ ├── AlpacaCOT.md │ ├── AlpacaCOT_ZH.md │ ├── AnnotationNotificationSystem.md │ ├── BLOOM.md │ ├── BLOOM_ZH.md │ ├── RecipeGallery.md │ ├── RecipeGallery_ZH.md │ ├── RedPajama.md │ └── RedPajama_ZH.md ├── imgs │ ├── data-juicer.jpg │ ├── dj_agent_image.png │ ├── dj_dev_agent_image.png │ ├── eval-01.png │ └── eval-02.png ├── op_doc_enhance_workflow │ ├── check_tools │ │ └── check_render.py │ ├── examples.json │ ├── generate_op_details.py │ ├── rewrite_op_docstrings.py │ ├── runner.py │ ├── templates │ │ └── op_doc.md.j2 │ └── utils │ │ ├── __init__.py │ │ ├── example_ir.py │ │ ├── extractor.py │ │ ├── model.py │ │ ├── parse_class.py │ │ ├── router.py │ │ └── view_model.py ├── operators │ ├── aggregator │ │ ├── entity_attribute_aggregator.md │ │ ├── meta_tags_aggregator.md │ │ ├── most_relevant_entities_aggregator.md │ │ └── nested_aggregator.md │ ├── deduplicator │ │ ├── document_deduplicator.md │ │ ├── document_minhash_deduplicator.md │ │ ├── document_simhash_deduplicator.md │ │ ├── image_deduplicator.md │ │ ├── ray_bts_minhash_deduplicator.md │ │ ├── ray_document_deduplicator.md │ │ ├── ray_image_deduplicator.md │ │ ├── ray_video_deduplicator.md │ │ └── video_deduplicator.md │ ├── filter │ │ ├── alphanumeric_filter.md │ │ ├── audio_duration_filter.md │ │ ├── audio_nmf_snr_filter.md │ │ ├── audio_size_filter.md │ │ ├── average_line_length_filter.md │ │ ├── character_repetition_filter.md │ │ ├── flagged_words_filter.md │ │ ├── general_field_filter.md │ │ ├── image_aesthetics_filter.md │ │ ├── image_aspect_ratio_filter.md │ │ ├── image_face_count_filter.md │ │ ├── image_face_ratio_filter.md │ │ ├── image_nsfw_filter.md │ │ ├── image_pair_similarity_filter.md │ │ ├── image_shape_filter.md │ │ ├── image_size_filter.md │ │ ├── image_text_matching_filter.md │ │ ├── image_text_similarity_filter.md │ │ ├── image_watermark_filter.md │ │ ├── in_context_influence_filter.md │ │ ├── instruction_following_difficulty_filter.md │ │ ├── language_id_score_filter.md │ │ ├── llm_analysis_filter.md │ │ ├── llm_difficulty_score_filter.md │ │ ├── llm_perplexity_filter.md │ │ ├── llm_quality_score_filter.md │ │ ├── llm_task_relevance_filter.md │ │ ├── maximum_line_length_filter.md │ │ ├── perplexity_filter.md │ │ ├── phrase_grounding_recall_filter.md │ │ ├── special_characters_filter.md │ │ ├── specified_field_filter.md │ │ ├── specified_numeric_field_filter.md │ │ ├── stopwords_filter.md │ │ ├── suffix_filter.md │ │ ├── text_action_filter.md │ │ ├── text_embd_similarity_filter.md │ │ ├── text_entity_dependency_filter.md │ │ ├── text_length_filter.md │ │ ├── text_pair_similarity_filter.md │ │ ├── token_num_filter.md │ │ ├── video_aesthetics_filter.md │ │ ├── video_aspect_ratio_filter.md │ │ ├── video_duration_filter.md │ │ ├── video_frames_text_similarity_filter.md │ │ ├── video_motion_score_filter.md │ │ ├── video_motion_score_raft_filter.md │ │ ├── video_nsfw_filter.md │ │ ├── video_ocr_area_ratio_filter.md │ │ ├── video_resolution_filter.md │ │ ├── video_tagging_from_frames_filter.md │ │ ├── video_watermark_filter.md │ │ ├── word_repetition_filter.md │ │ └── words_num_filter.md │ ├── formatter │ │ ├── csv_formatter.md │ │ ├── empty_formatter.md │ │ ├── json_formatter.md │ │ ├── parquet_formatter.md │ │ ├── ray_empty_formatter.md │ │ ├── text_formatter.md │ │ └── tsv_formatter.md │ ├── grouper │ │ ├── key_value_grouper.md │ │ ├── naive_grouper.md │ │ └── naive_reverse_grouper.md │ ├── mapper │ │ ├── audio_add_gaussian_noise_mapper.md │ │ ├── audio_ffmpeg_wrapped_mapper.md │ │ ├── calibrate_qa_mapper.md │ │ ├── calibrate_query_mapper.md │ │ ├── calibrate_response_mapper.md │ │ ├── chinese_convert_mapper.md │ │ ├── clean_copyright_mapper.md │ │ ├── clean_email_mapper.md │ │ ├── clean_html_mapper.md │ │ ├── clean_ip_mapper.md │ │ ├── clean_links_mapper.md │ │ ├── detect_character_attributes_mapper.md │ │ ├── detect_character_locations_mapper.md │ │ ├── detect_main_character_mapper.md │ │ ├── dialog_intent_detection_mapper.md │ │ ├── dialog_sentiment_detection_mapper.md │ │ ├── dialog_sentiment_intensity_mapper.md │ │ ├── dialog_topic_detection_mapper.md │ │ ├── download_file_mapper.md │ │ ├── expand_macro_mapper.md │ │ ├── extract_entity_attribute_mapper.md │ │ ├── extract_entity_relation_mapper.md │ │ ├── extract_event_mapper.md │ │ ├── extract_keyword_mapper.md │ │ ├── extract_nickname_mapper.md │ │ ├── extract_support_text_mapper.md │ │ ├── extract_tables_from_html_mapper.md │ │ ├── fix_unicode_mapper.md │ │ ├── generate_qa_from_examples_mapper.md │ │ ├── generate_qa_from_text_mapper.md │ │ ├── human_preference_annotation_mapper.md │ │ ├── image_blur_mapper.md │ │ ├── image_captioning_from_gpt4v_mapper.md │ │ ├── image_captioning_mapper.md │ │ ├── image_detection_yolo_mapper.md │ │ ├── image_diffusion_mapper.md │ │ ├── image_face_blur_mapper.md │ │ ├── image_remove_background_mapper.md │ │ ├── image_segment_mapper.md │ │ ├── image_tagging_mapper.md │ │ ├── imgdiff_difference_area_generator_mapper.md │ │ ├── imgdiff_difference_caption_generator_mapper.md │ │ ├── mllm_mapper.md │ │ ├── nlpaug_en_mapper.md │ │ ├── nlpcda_zh_mapper.md │ │ ├── optimize_prompt_mapper.md │ │ ├── optimize_qa_mapper.md │ │ ├── optimize_query_mapper.md │ │ ├── optimize_response_mapper.md │ │ ├── pair_preference_mapper.md │ │ ├── punctuation_normalization_mapper.md │ │ ├── python_file_mapper.md │ │ ├── python_lambda_mapper.md │ │ ├── query_intent_detection_mapper.md │ │ ├── query_sentiment_detection_mapper.md │ │ ├── query_topic_detection_mapper.md │ │ ├── relation_identity_mapper.md │ │ ├── remove_bibliography_mapper.md │ │ ├── remove_comments_mapper.md │ │ ├── remove_header_mapper.md │ │ ├── remove_long_words_mapper.md │ │ ├── remove_non_chinese_character_mapper.md │ │ ├── remove_repeat_sentences_mapper.md │ │ ├── remove_specific_chars_mapper.md │ │ ├── remove_table_text_mapper.md │ │ ├── remove_words_with_incorrect_substrings_mapper.md │ │ ├── replace_content_mapper.md │ │ ├── sdxl_prompt2prompt_mapper.md │ │ ├── sentence_augmentation_mapper.md │ │ ├── sentence_split_mapper.md │ │ ├── text_chunk_mapper.md │ │ ├── vggt_mapper.md │ │ ├── video_captioning_from_audio_mapper.md │ │ ├── video_captioning_from_frames_mapper.md │ │ ├── video_captioning_from_summarizer_mapper.md │ │ ├── video_captioning_from_video_mapper.md │ │ ├── video_extract_frames_mapper.md │ │ ├── video_face_blur_mapper.md │ │ ├── video_ffmpeg_wrapped_mapper.md │ │ ├── video_remove_watermark_mapper.md │ │ ├── video_resize_aspect_ratio_mapper.md │ │ ├── video_resize_resolution_mapper.md │ │ ├── video_split_by_duration_mapper.md │ │ ├── video_split_by_key_frame_mapper.md │ │ ├── video_split_by_scene_mapper.md │ │ ├── video_tagging_from_audio_mapper.md │ │ ├── video_tagging_from_frames_mapper.md │ │ └── whitespace_normalization_mapper.md │ ├── op │ │ └── general_fused_op.md │ └── selector │ │ ├── frequency_specified_field_selector.md │ │ ├── random_selector.md │ │ ├── range_specified_field_selector.md │ │ ├── tags_specified_field_selector.md │ │ └── topk_specified_field_selector.md ├── sphinx_doc │ ├── Makefile │ ├── README.md │ ├── README_ZH.md │ ├── _templates │ │ └── package.rst_t │ ├── build_doc.sh │ ├── build_versions.py │ ├── make.bat │ ├── redirect.html │ └── source │ │ ├── _static │ │ ├── awesome-list.html │ │ ├── images │ │ │ ├── icon.png │ │ │ └── logo.png │ │ ├── sidebar-menu.css │ │ └── tutorial_kdd24.html │ │ ├── _templates │ │ ├── page.html │ │ └── sidebar │ │ │ ├── bottom_menu.html │ │ │ └── brand.html │ │ ├── api.rst │ │ ├── conf.py │ │ ├── custom_myst.py │ │ ├── index.rst │ │ └── index_ZH.rst └── tutorial │ ├── DJ-Cookbook.md │ ├── DJ-Cookbook_ZH.md │ ├── Installation.md │ ├── Installation_ZH.md │ ├── QuickStart.md │ └── QuickStart_ZH.md ├── label_studio_localhost_connection.json ├── pyproject.toml ├── scripts ├── README.md ├── dlc │ ├── partition_data_dlc.py │ └── run_on_dlc.sh └── run_slurm.sh ├── service.py ├── tests ├── __init__.py ├── analysis │ ├── __init__.py │ ├── test_collector.py │ ├── test_column_wise_analysis.py │ ├── test_correlation_analysis.py │ ├── test_diversity_analysis.py │ ├── test_measure.py │ └── test_overall_analysis.py ├── benchmark_performance │ ├── configs │ │ ├── audio.yaml │ │ ├── image.yaml │ │ ├── text.yaml │ │ └── video.yaml │ ├── report.py │ └── run.sh ├── config │ ├── __init__.py │ ├── demo_4_test.yaml │ ├── demo_4_test_bad_val.yaml │ ├── demo_4_test_multiple_text_keys.yaml │ ├── demo_4_test_same_ops.yaml │ └── test_config.py ├── core │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── test_config_validator.py │ │ ├── test_data │ │ │ ├── parquet │ │ │ │ └── sample.parquet │ │ │ ├── sample.jsonl │ │ │ ├── sample.txt │ │ │ ├── test_config.yaml │ │ │ ├── test_config_list.yaml │ │ │ └── test_config_ray.yaml │ │ ├── test_data_validator.py │ │ ├── test_dataset_builder.py │ │ ├── test_dj_dataset.py │ │ ├── test_load_strategy.py │ │ ├── test_ray_dataset.py │ │ └── test_schema.py │ ├── executor │ │ ├── __init__.py │ │ ├── test_base.py │ │ ├── test_default_executor.py │ │ ├── test_factory.py │ │ └── test_ray_executor.py │ ├── test_adapter.py │ ├── test_analyzer.py │ ├── test_exporter.py │ ├── test_monitor.py │ ├── test_ray_exporter.py │ └── test_tracer.py ├── download │ ├── __init__.py │ └── test_download.py ├── format │ ├── __init__.py │ ├── data │ │ ├── structured │ │ │ ├── demo-dataset.csv │ │ │ ├── demo-dataset.jsonl │ │ │ ├── demo-dataset.parquet │ │ │ └── demo-dataset.tsv │ │ └── text │ │ │ ├── sample1.txt │ │ │ ├── sample2.txt │ │ │ ├── sample3.txt │ │ │ ├── sample4.txt │ │ │ ├── sample5.txt │ │ │ └── sample6.txt │ ├── test_csv_formatter.py │ ├── test_empty_formatter.py │ ├── test_json_formatter.py │ ├── test_load_formatter.py │ ├── test_parquet_formatter.py │ ├── test_tsv_formatter.py │ └── test_unify_format.py ├── ops │ ├── __init__.py │ ├── aggregator │ │ ├── __init__.py │ │ ├── test_entity_attribute_aggregator.py │ │ ├── test_meta_tags_aggregator.py │ │ ├── test_most_relevant_entities_aggregator.py │ │ └── test_nested_aggregator.py │ ├── common │ │ └── __init__.py │ ├── data │ │ ├── audio1.wav │ │ ├── audio2.wav │ │ ├── audio3.ogg │ │ ├── blip.jpg │ │ ├── cat.jpg │ │ ├── img1.png │ │ ├── img10.jpg │ │ ├── img2.jpg │ │ ├── img3.jpg │ │ ├── img4.png │ │ ├── img5.jpg │ │ ├── img6.jpg │ │ ├── img7.jpg │ │ ├── img8.jpg │ │ ├── img9.jpg │ │ ├── img_pair_1.jpg │ │ ├── img_pair_2.jpg │ │ ├── lena-face.jpg │ │ ├── lena.jpg │ │ ├── video1.mp4 │ │ ├── video10.mp4 │ │ ├── video11.mp4 │ │ ├── video2.mp4 │ │ ├── video3-no-audio.mp4 │ │ ├── video3.mp4 │ │ ├── video4.mp4 │ │ ├── video5.mp4 │ │ ├── video6.mp4 │ │ ├── video7.mp4 │ │ ├── video8.mp4 │ │ └── video9.mp4 │ ├── deduplicator │ │ ├── __init__.py │ │ ├── test_document_deduplicator.py │ │ ├── test_document_minhash_deduplicator.py │ │ ├── test_document_simhash_deduplicator.py │ │ ├── test_image_deduplicator.py │ │ ├── test_ray_bts_minhash_deduplicator.py │ │ ├── test_ray_document_deduplicator.py │ │ ├── test_ray_image_deduplicator.py │ │ ├── test_ray_video_deduplicator.py │ │ └── test_video_deduplicator.py │ ├── filter │ │ ├── __init__.py │ │ ├── test_alphanumeric_filter.py │ │ ├── test_audio_duration_filter.py │ │ ├── test_audio_nmf_snr_filter.py │ │ ├── test_audio_size_filter.py │ │ ├── test_average_line_length_filter.py │ │ ├── test_character_repetition_filter.py │ │ ├── test_flagged_words_filter.py │ │ ├── test_general_field_filter.py │ │ ├── test_image_aesthetics_filter.py │ │ ├── test_image_aspect_ratio_filter.py │ │ ├── test_image_face_count_filter.py │ │ ├── test_image_face_ratio_filter.py │ │ ├── test_image_nsfw_filter.py │ │ ├── test_image_pair_similarity_filter.py │ │ ├── test_image_shape_filter.py │ │ ├── test_image_size_filter.py │ │ ├── test_image_text_matching_filter.py │ │ ├── test_image_text_similarity_filter.py │ │ ├── test_image_watermark_filter.py │ │ ├── test_in_context_influence_filter.py │ │ ├── test_instruction_following_difficulty_filter.py │ │ ├── test_language_id_score_filter.py │ │ ├── test_llm_analysis_filter.py │ │ ├── test_llm_difficulty_score_filter.py │ │ ├── test_llm_perplexity_filter.py │ │ ├── test_llm_quality_score_filter.py │ │ ├── test_llm_task_relevance_filter.py │ │ ├── test_maximum_line_length_filter.py │ │ ├── test_perplexity_filter.py │ │ ├── test_phrase_grounding_recall_filter.py │ │ ├── test_special_characters_filter.py │ │ ├── test_specified_field_filter.py │ │ ├── test_specified_numeric_field_filter.py │ │ ├── test_stopwords_filter.py │ │ ├── test_suffix_filter.py │ │ ├── test_text_action_filter.py │ │ ├── test_text_embd_similarity_filter.py │ │ ├── test_text_entity_dependency_filter.py │ │ ├── test_text_length_filter.py │ │ ├── test_text_pair_similarity_filter.py │ │ ├── test_token_num_filter.py │ │ ├── test_video_aesthetics_filter.py │ │ ├── test_video_aspect_ratio_filter.py │ │ ├── test_video_duration_filter.py │ │ ├── test_video_frames_text_similarity_filter.py │ │ ├── test_video_motion_score_filter.py │ │ ├── test_video_motion_score_raft_filter.py │ │ ├── test_video_nsfw_filter.py │ │ ├── test_video_ocr_area_ratio_filter.py │ │ ├── test_video_resolution_filter.py │ │ ├── test_video_tagging_from_frames_filter.py │ │ ├── test_video_watermark_filter.py │ │ ├── test_word_repetition_filter.py │ │ └── test_words_num_filter.py │ ├── grouper │ │ ├── __init__.py │ │ ├── test_key_value_grouper.py │ │ ├── test_naive_grouper.py │ │ └── test_naive_reverse_grouper.py │ ├── mapper │ │ ├── __init__.py │ │ ├── annotation │ │ │ ├── __init__.py │ │ │ ├── test_annotation_mapper.py │ │ │ └── test_human_preference_annotation_mapper.py │ │ ├── test_audio_add_gaussian_noise_mapper.py │ │ ├── test_audio_ffmpeg_wrapped_mapper.py │ │ ├── test_calibrate_qa_mapper.py │ │ ├── test_calibrate_query_mapper.py │ │ ├── test_calibrate_response_mapper.py │ │ ├── test_chinese_convert_mapper.py │ │ ├── test_clean_copyright_mapper.py │ │ ├── test_clean_email_mapper.py │ │ ├── test_clean_html_mapper.py │ │ ├── test_clean_ip_mapper.py │ │ ├── test_clean_links_mapper.py │ │ ├── test_detect_character_attributes_mapper.py │ │ ├── test_detect_character_locations_mapper.py │ │ ├── test_detect_main_character_mapper.py │ │ ├── test_dialog_intent_detection_mapper.py │ │ ├── test_dialog_sentiment_detection_mapper.py │ │ ├── test_dialog_sentiment_intensity_mapper.py │ │ ├── test_dialog_topic_detection_mapper.py │ │ ├── test_download_file_mapper.py │ │ ├── test_expand_macro_mapper.py │ │ ├── test_extract_entity_attribute_mapper.py │ │ ├── test_extract_entity_relation_mapper.py │ │ ├── test_extract_event_mapper.py │ │ ├── test_extract_keyword_mapper.py │ │ ├── test_extract_nickname_mapper.py │ │ ├── test_extract_support_text_mapper.py │ │ ├── test_extract_tables_from_html_mapper.py │ │ ├── test_fix_unicode_mapper.py │ │ ├── test_generate_qa_from_examples_mapper.py │ │ ├── test_generate_qa_from_text_mapper.py │ │ ├── test_image_blur_mapper.py │ │ ├── test_image_captioning_from_gpt4v_mapper.py │ │ ├── test_image_captioning_mapper.py │ │ ├── test_image_detection_yolo_mapper.py │ │ ├── test_image_diffusion_mapper.py │ │ ├── test_image_face_blur_mapper.py │ │ ├── test_image_remove_background_mapper.py │ │ ├── test_image_segment_mapper.py │ │ ├── test_image_tagging_mapper.py │ │ ├── test_imgdiff_difference_area_generator_mapper.py │ │ ├── test_imgdiff_difference_caption_generator_mapper.py │ │ ├── test_mllm_mapper.py │ │ ├── test_nlpaug_en_mapper.py │ │ ├── test_nlpcda_zh_mapper.py │ │ ├── test_optimize_prompt_mapper.py │ │ ├── test_optimize_qa_mapper.py │ │ ├── test_optimize_query_mapper.py │ │ ├── test_optimize_response_mapper.py │ │ ├── test_pair_preference_mapper.py │ │ ├── test_punctuation_normalization_mapper.py │ │ ├── test_python_file_mapper.py │ │ ├── test_python_lambda_mapper.py │ │ ├── test_query_intent_detection_mapper.py │ │ ├── test_query_sentiment_detection_mapper.py │ │ ├── test_query_topic_detection_mapper.py │ │ ├── test_relation_identity_mapper.py │ │ ├── test_remove_bibliography_mapper.py │ │ ├── test_remove_comments_mapper.py │ │ ├── test_remove_header_mapper.py │ │ ├── test_remove_long_words_mapper.py │ │ ├── test_remove_non_chinese_character_mapper.py │ │ ├── test_remove_repeat_sentences_mapper.py │ │ ├── test_remove_specific_chars_mapper.py │ │ ├── test_remove_table_text_mapper.py │ │ ├── test_remove_words_with_incorrect_substrings_mapper.py │ │ ├── test_replace_content_mapper.py │ │ ├── test_sdxl_prompt2prompt_mapper.py │ │ ├── test_sentence_augmentation_mapper.py │ │ ├── test_sentence_split_mapper.py │ │ ├── test_text_chunk_mapper.py │ │ ├── test_text_tagging_by_prompt_mapper.py │ │ ├── test_vggt_mapper.py │ │ ├── test_video_captioning_from_audio_mapper.py │ │ ├── test_video_captioning_from_frames_mapper.py │ │ ├── test_video_captioning_from_summarizer_mapper.py │ │ ├── test_video_captioning_from_video_mapper.py │ │ ├── test_video_extract_frames_mapper.py │ │ ├── test_video_face_blur_mapper.py │ │ ├── test_video_ffmpeg_wrapped_mapper.py │ │ ├── test_video_hand_reconstruction_mapper.py │ │ ├── test_video_remove_watermark_mapper.py │ │ ├── test_video_resize_aspect_ratio_mapper.py │ │ ├── test_video_resize_resolution_mapper.py │ │ ├── test_video_split_by_duration_mapper.py │ │ ├── test_video_split_by_key_frame_mapper.py │ │ ├── test_video_split_by_scene_mapper.py │ │ ├── test_video_tagging_from_audio_mapper.py │ │ ├── test_video_tagging_from_frames_mapper.py │ │ ├── test_video_whole_body_pose_estimation_mapper.py │ │ └── test_whitespace_normalization_mapper.py │ ├── selector │ │ ├── __init__.py │ │ ├── test_frequency_specified_field_selector.py │ │ ├── test_random_selector.py │ │ ├── test_range_specified_field_selector.py │ │ ├── test_tags_specified_field_selector.py │ │ └── test_topk_specified_field_selector.py │ ├── test_base_op.py │ └── test_op_fusion.py ├── run.py ├── tools │ ├── __init__.py │ ├── test_mcp_server.py │ └── test_process_data.py └── utils │ ├── __init__.py │ ├── test_asset_utils.py │ ├── test_availablility_utils.py │ ├── test_cache_utils.py │ ├── test_ckpt_utils.py │ ├── test_common_utils.py │ ├── test_compress.py │ ├── test_constant.py │ ├── test_file_utils.py │ ├── test_fingerprint_utils.py │ ├── test_lazy_loader.py │ ├── test_logger_utils.py │ ├── test_mm_utils.py │ ├── test_model_utils.py │ ├── test_process_utils.py │ ├── test_registry.py │ ├── test_resource_utils.py │ ├── test_s3_utils.py │ ├── test_unittest_utils.py │ └── test_video_utils.py ├── thirdparty ├── LLM_ecosystems │ ├── README.md │ ├── README_ZH.md │ ├── patch │ │ ├── helm.diff │ │ └── megatron.diff │ ├── setup_helm.sh │ └── setup_megatron.sh └── models │ ├── README.md │ ├── README_ZH.md │ ├── patch │ └── easyanimate.diff │ └── setup_easyanimate.sh ├── tools ├── __init__.py ├── analyze_data.py ├── check_ray_cluster.py ├── check_s3_integration.py ├── converter │ ├── batch_convert.sh │ ├── convert_gpt_to_transformers.py │ └── modeling_megatron_llama.py ├── data_resplit.py ├── distributed_deduplication │ ├── README.md │ ├── README_ZH.md │ ├── __init__.py │ ├── dedup_utils.py │ └── spark_dedup.py ├── dj_install.py ├── evaluator │ ├── README.md │ ├── README_ZH.md │ ├── config │ │ ├── evaluator_example.yaml │ │ └── helm_spec_template.conf │ ├── evaluator.py │ ├── gpt_eval │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── __init__.py │ │ ├── answer │ │ │ └── openai │ │ │ │ └── gpt-3.5-turbo.jsonl │ │ ├── answer_generator.py │ │ ├── config │ │ │ ├── config.yaml │ │ │ ├── prompt.jsonl │ │ │ ├── question.jsonl │ │ │ └── reviewer.jsonl │ │ └── gpt_evaluator.py │ └── recorder │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── __init__.py │ │ ├── config │ │ ├── leaderboard_example.yaml │ │ ├── llama_example.yaml │ │ └── mymodel_example.yaml │ │ └── wandb_writer.py ├── fmt_conversion │ ├── README.md │ ├── README_ZH.md │ ├── multimodal │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── absolute_path_to_relative_path.py │ │ ├── data_juicer_format_to_target_format │ │ │ ├── dj_to_internvid.py │ │ │ ├── dj_to_llava.py │ │ │ ├── dj_to_mmc4.py │ │ │ ├── dj_to_msrvtt.py │ │ │ ├── dj_to_video_chatgpt.py │ │ │ ├── dj_to_wavcaps.py │ │ │ └── dj_to_youku.py │ │ ├── source_format_to_data_juicer_format │ │ │ ├── internvid_to_dj.py │ │ │ ├── llava_to_dj.py │ │ │ ├── mmc4_to_dj.py │ │ │ ├── msrvtt_to_dj.py │ │ │ ├── video_chatgpt_to_dj.py │ │ │ ├── wavcaps_to_dj.py │ │ │ └── youku_to_dj.py │ │ └── utils.py │ └── post_tuning_dialog │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── data_juicer_format_to_target_format │ │ ├── dj_to_alpaca.py │ │ ├── dj_to_llama_factory_sharegpt.py │ │ ├── dj_to_messages.py │ │ └── dj_to_ms_swift_sharegpt.py │ │ └── source_format_to_data_juicer_format │ │ ├── alpaca_to_dj.py │ │ ├── llama_factory_sharegpt_to_dj.py │ │ ├── messages_to_dj.py │ │ └── ms_swift_sharegpt_to_dj.py ├── generate_smtp_cert.py ├── generate_uv_lock.py ├── humanops │ ├── README.md │ ├── enable_legacy_token.png │ └── label_studio_service.py ├── mm_eval │ ├── __init__.py │ ├── inception_metrics │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── calc_metrics_for_videos.py │ │ ├── dataset.py │ │ ├── distributed.py │ │ ├── util.py │ │ └── video_metrics │ │ │ ├── frechet_inception_distance.py │ │ │ ├── frechet_video_distance.py │ │ │ ├── inception_score.py │ │ │ ├── kernel_inception_distance.py │ │ │ ├── kernel_video_distance.py │ │ │ ├── metric_main.py │ │ │ ├── metric_utils.py │ │ │ ├── precision_recall.py │ │ │ ├── video_inception_score.py │ │ │ └── video_precision_recall.py │ └── vbench_metrics │ │ ├── README.md │ │ ├── README_ZH.md │ │ ├── VBench_full_info.json │ │ ├── VBench_mini_info.json │ │ └── evaluate.py ├── multimodal │ └── __init__.py ├── postprocess │ ├── README.md │ ├── README_ZH.md │ ├── count_token.py │ ├── data_mixture.py │ └── deserialize_meta.py ├── preprocess │ ├── README.md │ ├── README_ZH.md │ ├── dataset_split_by_language.py │ ├── raw_alpaca_cot_merge_add_meta.py │ ├── raw_arxiv_to_jsonl.py │ ├── raw_stackexchange_to_jsonl.py │ ├── reformat_csv_nan_value.py │ ├── reformat_jsonl_nan_value.py │ └── serialize_meta.py └── process_data.py └── uv.lock /.coveragerc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.coveragerc -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/ISSUE_TEMPLATE/bug_report.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/ISSUE_TEMPLATE/custom.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/ISSUE_TEMPLATE/feature_request.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/ISSUE_TEMPLATE/question.yml -------------------------------------------------------------------------------- /.github/workflows/deploy_sphinx_docs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/deploy_sphinx_docs.yml -------------------------------------------------------------------------------- /.github/workflows/docker/docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/docker/docker-compose.yml -------------------------------------------------------------------------------- /.github/workflows/perf-bench.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/perf-bench.yml -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/pre-commit.yml -------------------------------------------------------------------------------- /.github/workflows/publish-docker-oss.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/publish-docker-oss.yml -------------------------------------------------------------------------------- /.github/workflows/publish-docker.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/publish-docker.yml -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/publish-pypi.yml -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/stale.yml -------------------------------------------------------------------------------- /.github/workflows/unit-test-partial.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/unit-test-partial.yml -------------------------------------------------------------------------------- /.github/workflows/unit-test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.github/workflows/unit-test.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /.pre-commit-hooks/build_op_doc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.pre-commit-hooks/build_op_doc.py -------------------------------------------------------------------------------- /.pre-commit-hooks/tag_mappings.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.pre-commit-hooks/tag_mappings.json -------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/.secrets.baseline -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/Dockerfile -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/README.md -------------------------------------------------------------------------------- /README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/README_ZH.md -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/app.py -------------------------------------------------------------------------------- /data_juicer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/__init__.py -------------------------------------------------------------------------------- /data_juicer/analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/analysis/__init__.py -------------------------------------------------------------------------------- /data_juicer/analysis/collector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/analysis/collector.py -------------------------------------------------------------------------------- /data_juicer/analysis/column_wise_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/analysis/column_wise_analysis.py -------------------------------------------------------------------------------- /data_juicer/analysis/correlation_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/analysis/correlation_analysis.py -------------------------------------------------------------------------------- /data_juicer/analysis/diversity_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/analysis/diversity_analysis.py -------------------------------------------------------------------------------- /data_juicer/analysis/measure.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/analysis/measure.py -------------------------------------------------------------------------------- /data_juicer/analysis/overall_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/analysis/overall_analysis.py -------------------------------------------------------------------------------- /data_juicer/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/config/__init__.py -------------------------------------------------------------------------------- /data_juicer/config/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/config/config.py -------------------------------------------------------------------------------- /data_juicer/config/config_all.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/config/config_all.yaml -------------------------------------------------------------------------------- /data_juicer/config/config_min.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/config/config_min.yaml -------------------------------------------------------------------------------- /data_juicer/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/__init__.py -------------------------------------------------------------------------------- /data_juicer/core/adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/adapter.py -------------------------------------------------------------------------------- /data_juicer/core/analyzer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/analyzer.py -------------------------------------------------------------------------------- /data_juicer/core/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/data/__init__.py -------------------------------------------------------------------------------- /data_juicer/core/data/config_validator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/data/config_validator.py -------------------------------------------------------------------------------- /data_juicer/core/data/data_validator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/data/data_validator.py -------------------------------------------------------------------------------- /data_juicer/core/data/dataset_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/data/dataset_builder.py -------------------------------------------------------------------------------- /data_juicer/core/data/dj_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/data/dj_dataset.py -------------------------------------------------------------------------------- /data_juicer/core/data/load_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/data/load_strategy.py -------------------------------------------------------------------------------- /data_juicer/core/data/ray_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/data/ray_dataset.py -------------------------------------------------------------------------------- /data_juicer/core/data/schema.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/data/schema.py -------------------------------------------------------------------------------- /data_juicer/core/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/executor/__init__.py -------------------------------------------------------------------------------- /data_juicer/core/executor/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/executor/base.py -------------------------------------------------------------------------------- /data_juicer/core/executor/default_executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/executor/default_executor.py -------------------------------------------------------------------------------- /data_juicer/core/executor/factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/executor/factory.py -------------------------------------------------------------------------------- /data_juicer/core/executor/ray_executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/executor/ray_executor.py -------------------------------------------------------------------------------- /data_juicer/core/exporter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/exporter.py -------------------------------------------------------------------------------- /data_juicer/core/monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/monitor.py -------------------------------------------------------------------------------- /data_juicer/core/ray_exporter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/ray_exporter.py -------------------------------------------------------------------------------- /data_juicer/core/tracer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/core/tracer.py -------------------------------------------------------------------------------- /data_juicer/download/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_juicer/download/arxiv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/download/arxiv.py -------------------------------------------------------------------------------- /data_juicer/download/commoncrawl.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_juicer/download/downloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/download/downloader.py -------------------------------------------------------------------------------- /data_juicer/download/wikipedia.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/download/wikipedia.py -------------------------------------------------------------------------------- /data_juicer/format/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/format/__init__.py -------------------------------------------------------------------------------- /data_juicer/format/csv_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/format/csv_formatter.py -------------------------------------------------------------------------------- /data_juicer/format/empty_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/format/empty_formatter.py -------------------------------------------------------------------------------- /data_juicer/format/formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/format/formatter.py -------------------------------------------------------------------------------- /data_juicer/format/json_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/format/json_formatter.py -------------------------------------------------------------------------------- /data_juicer/format/load.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/format/load.py -------------------------------------------------------------------------------- /data_juicer/format/parquet_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/format/parquet_formatter.py -------------------------------------------------------------------------------- /data_juicer/format/text_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/format/text_formatter.py -------------------------------------------------------------------------------- /data_juicer/format/tsv_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/format/tsv_formatter.py -------------------------------------------------------------------------------- /data_juicer/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/__init__.py -------------------------------------------------------------------------------- /data_juicer/ops/aggregator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/aggregator/__init__.py -------------------------------------------------------------------------------- /data_juicer/ops/aggregator/meta_tags_aggregator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/aggregator/meta_tags_aggregator.py -------------------------------------------------------------------------------- /data_juicer/ops/aggregator/nested_aggregator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/aggregator/nested_aggregator.py -------------------------------------------------------------------------------- /data_juicer/ops/base_op.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/base_op.py -------------------------------------------------------------------------------- /data_juicer/ops/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/common/__init__.py -------------------------------------------------------------------------------- /data_juicer/ops/common/dwpose_func.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/common/dwpose_func.py -------------------------------------------------------------------------------- /data_juicer/ops/common/helper_func.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/common/helper_func.py -------------------------------------------------------------------------------- /data_juicer/ops/common/prompt2prompt_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/common/prompt2prompt_pipeline.py -------------------------------------------------------------------------------- /data_juicer/ops/common/special_characters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/common/special_characters.py -------------------------------------------------------------------------------- /data_juicer/ops/deduplicator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/deduplicator/__init__.py -------------------------------------------------------------------------------- /data_juicer/ops/deduplicator/document_deduplicator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/deduplicator/document_deduplicator.py -------------------------------------------------------------------------------- /data_juicer/ops/deduplicator/image_deduplicator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/deduplicator/image_deduplicator.py -------------------------------------------------------------------------------- /data_juicer/ops/deduplicator/ray_basic_deduplicator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/deduplicator/ray_basic_deduplicator.py -------------------------------------------------------------------------------- /data_juicer/ops/deduplicator/ray_image_deduplicator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/deduplicator/ray_image_deduplicator.py -------------------------------------------------------------------------------- /data_juicer/ops/deduplicator/ray_video_deduplicator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/deduplicator/ray_video_deduplicator.py -------------------------------------------------------------------------------- /data_juicer/ops/deduplicator/video_deduplicator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/deduplicator/video_deduplicator.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/__init__.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/alphanumeric_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/alphanumeric_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/audio_duration_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/audio_duration_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/audio_nmf_snr_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/audio_nmf_snr_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/audio_size_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/audio_size_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/average_line_length_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/average_line_length_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/character_repetition_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/character_repetition_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/flagged_words_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/flagged_words_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/general_field_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/general_field_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_aesthetics_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_aesthetics_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_aspect_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_aspect_ratio_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_face_count_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_face_count_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_face_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_face_ratio_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_nsfw_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_nsfw_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_pair_similarity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_pair_similarity_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_shape_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_shape_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_size_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_size_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_text_matching_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_text_matching_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_text_similarity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_text_similarity_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/image_watermark_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/image_watermark_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/in_context_influence_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/in_context_influence_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/language_id_score_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/language_id_score_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/llm_analysis_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/llm_analysis_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/llm_difficulty_score_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/llm_difficulty_score_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/llm_perplexity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/llm_perplexity_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/llm_quality_score_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/llm_quality_score_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/llm_task_relevance_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/llm_task_relevance_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/maximum_line_length_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/maximum_line_length_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/perplexity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/perplexity_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/special_characters_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/special_characters_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/specified_field_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/specified_field_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/stopwords_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/stopwords_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/suffix_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/suffix_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/text_action_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/text_action_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/text_embd_similarity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/text_embd_similarity_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/text_entity_dependency_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/text_entity_dependency_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/text_length_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/text_length_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/text_pair_similarity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/text_pair_similarity_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/token_num_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/token_num_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/video_aesthetics_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/video_aesthetics_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/video_aspect_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/video_aspect_ratio_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/video_duration_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/video_duration_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/video_motion_score_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/video_motion_score_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/video_nsfw_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/video_nsfw_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/video_ocr_area_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/video_ocr_area_ratio_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/video_resolution_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/video_resolution_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/video_watermark_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/video_watermark_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/word_repetition_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/word_repetition_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/filter/words_num_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/filter/words_num_filter.py -------------------------------------------------------------------------------- /data_juicer/ops/grouper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/grouper/__init__.py -------------------------------------------------------------------------------- /data_juicer/ops/grouper/key_value_grouper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/grouper/key_value_grouper.py -------------------------------------------------------------------------------- /data_juicer/ops/grouper/naive_grouper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/grouper/naive_grouper.py -------------------------------------------------------------------------------- /data_juicer/ops/grouper/naive_reverse_grouper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/grouper/naive_reverse_grouper.py -------------------------------------------------------------------------------- /data_juicer/ops/load.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/load.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/__init__.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/annotation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_juicer/ops/mapper/annotation/annotation_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/annotation/annotation_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/calibrate_qa_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/calibrate_qa_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/calibrate_query_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/calibrate_query_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/calibrate_response_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/calibrate_response_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/chinese_convert_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/chinese_convert_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/clean_copyright_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/clean_copyright_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/clean_email_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/clean_email_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/clean_html_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/clean_html_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/clean_ip_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/clean_ip_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/clean_links_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/clean_links_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/detect_main_character_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/detect_main_character_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/dialog_topic_detection_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/dialog_topic_detection_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/download_file_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/download_file_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/expand_macro_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/expand_macro_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/extract_event_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/extract_event_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/extract_keyword_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/extract_keyword_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/extract_nickname_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/extract_nickname_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/extract_support_text_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/extract_support_text_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/fix_unicode_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/fix_unicode_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/generate_qa_from_text_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/generate_qa_from_text_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/image_blur_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/image_blur_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/image_captioning_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/image_captioning_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/image_detection_yolo_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/image_detection_yolo_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/image_diffusion_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/image_diffusion_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/image_face_blur_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/image_face_blur_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/image_segment_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/image_segment_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/image_tagging_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/image_tagging_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/mllm_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/mllm_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/nlpaug_en_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/nlpaug_en_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/nlpcda_zh_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/nlpcda_zh_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/optimize_prompt_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/optimize_prompt_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/optimize_qa_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/optimize_qa_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/optimize_query_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/optimize_query_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/optimize_response_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/optimize_response_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/pair_preference_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/pair_preference_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/python_file_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/python_file_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/python_lambda_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/python_lambda_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/query_intent_detection_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/query_intent_detection_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/query_topic_detection_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/query_topic_detection_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/relation_identity_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/relation_identity_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_bibliography_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/remove_bibliography_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_comments_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/remove_comments_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_header_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/remove_header_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_long_words_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/remove_long_words_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_specific_chars_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/remove_specific_chars_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/remove_table_text_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/remove_table_text_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/replace_content_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/replace_content_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/sdxl_prompt2prompt_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/sdxl_prompt2prompt_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/sentence_split_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/sentence_split_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/text_chunk_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/text_chunk_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/vggt_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/vggt_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mapper/video_face_blur_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mapper/video_face_blur_mapper.py -------------------------------------------------------------------------------- /data_juicer/ops/mixins.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/mixins.py -------------------------------------------------------------------------------- /data_juicer/ops/op_fusion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/op_fusion.py -------------------------------------------------------------------------------- /data_juicer/ops/selector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/selector/__init__.py -------------------------------------------------------------------------------- /data_juicer/ops/selector/random_selector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/ops/selector/random_selector.py -------------------------------------------------------------------------------- /data_juicer/tools/DJ_mcp_granular_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/DJ_mcp_granular_ops.py -------------------------------------------------------------------------------- /data_juicer/tools/DJ_mcp_recipe_flow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/DJ_mcp_recipe_flow.py -------------------------------------------------------------------------------- /data_juicer/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/__init__.py -------------------------------------------------------------------------------- /data_juicer/tools/hpo/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/hpo/README.md -------------------------------------------------------------------------------- /data_juicer/tools/hpo/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/hpo/README_ZH.md -------------------------------------------------------------------------------- /data_juicer/tools/hpo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_juicer/tools/hpo/configs/process.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/hpo/configs/process.yaml -------------------------------------------------------------------------------- /data_juicer/tools/hpo/configs/quality_score_hpo.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/hpo/configs/quality_score_hpo.yaml -------------------------------------------------------------------------------- /data_juicer/tools/hpo/execute_hpo_3sigma.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/hpo/execute_hpo_3sigma.py -------------------------------------------------------------------------------- /data_juicer/tools/hpo/execute_hpo_wandb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/hpo/execute_hpo_wandb.py -------------------------------------------------------------------------------- /data_juicer/tools/hpo/objects.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/hpo/objects.py -------------------------------------------------------------------------------- /data_juicer/tools/mcp_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/mcp_server.py -------------------------------------------------------------------------------- /data_juicer/tools/mcp_tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/mcp_tool.py -------------------------------------------------------------------------------- /data_juicer/tools/op_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/op_search.py -------------------------------------------------------------------------------- /data_juicer/tools/quality_classifier/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/quality_classifier/README.md -------------------------------------------------------------------------------- /data_juicer/tools/quality_classifier/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/quality_classifier/README_ZH.md -------------------------------------------------------------------------------- /data_juicer/tools/quality_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_juicer/tools/quality_classifier/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/quality_classifier/eval.py -------------------------------------------------------------------------------- /data_juicer/tools/quality_classifier/predict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/quality_classifier/predict.py -------------------------------------------------------------------------------- /data_juicer/tools/quality_classifier/qc_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/quality_classifier/qc_utils.py -------------------------------------------------------------------------------- /data_juicer/tools/quality_classifier/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/tools/quality_classifier/train.py -------------------------------------------------------------------------------- /data_juicer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_juicer/utils/asset_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/asset_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/availability_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/availability_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/cache_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/cache_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/ckpt_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/ckpt_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/common_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/common_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/compress.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/compress.py -------------------------------------------------------------------------------- /data_juicer/utils/constant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/constant.py -------------------------------------------------------------------------------- /data_juicer/utils/file_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/file_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/fingerprint_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/fingerprint_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/lazy_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/lazy_loader.py -------------------------------------------------------------------------------- /data_juicer/utils/logger_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/logger_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/mm_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/mm_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/model_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/model_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/nltk_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/nltk_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/process_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/process_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/ray_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/ray_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/registry.py -------------------------------------------------------------------------------- /data_juicer/utils/resource_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/resource_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/s3_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/s3_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/sample.py -------------------------------------------------------------------------------- /data_juicer/utils/unittest_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/unittest_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/video_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/video_utils.py -------------------------------------------------------------------------------- /data_juicer/utils/webdataset_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/data_juicer/utils/webdataset_utils.py -------------------------------------------------------------------------------- /demos/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/README.md -------------------------------------------------------------------------------- /demos/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/README_ZH.md -------------------------------------------------------------------------------- /demos/analyze_simple/analyzer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/analyze_simple/analyzer.yaml -------------------------------------------------------------------------------- /demos/api_service/configs/dj_config_template.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/api_service/configs/dj_config_template.yaml -------------------------------------------------------------------------------- /demos/api_service/configs/model_configs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/api_service/configs/model_configs.json -------------------------------------------------------------------------------- /demos/api_service/react_data_filter_process.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/api_service/react_data_filter_process.ipynb -------------------------------------------------------------------------------- /demos/api_service/react_data_mapper_process.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/api_service/react_data_mapper_process.ipynb -------------------------------------------------------------------------------- /demos/api_service/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/api_service/utils.py -------------------------------------------------------------------------------- /demos/api_service/wrapped_filters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/api_service/wrapped_filters.py -------------------------------------------------------------------------------- /demos/api_service/wrapped_mappers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/api_service/wrapped_mappers.py -------------------------------------------------------------------------------- /demos/auto_evaluation_helm/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/auto_evaluation_helm/README_ZH.md -------------------------------------------------------------------------------- /demos/auto_evaluation_helm/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/auto_evaluation_helm/app.py -------------------------------------------------------------------------------- /demos/auto_evaluation_helm/imgs/data-juicer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/auto_evaluation_helm/imgs/data-juicer.png -------------------------------------------------------------------------------- /demos/auto_evaluation_helm/imgs/eval-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/auto_evaluation_helm/imgs/eval-01.png -------------------------------------------------------------------------------- /demos/auto_evaluation_helm/imgs/eval-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/auto_evaluation_helm/imgs/eval-02.png -------------------------------------------------------------------------------- /demos/data/demo-dataset-chatml.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data/demo-dataset-chatml.jsonl -------------------------------------------------------------------------------- /demos/data/demo-dataset-content.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data/demo-dataset-content.jsonl -------------------------------------------------------------------------------- /demos/data/demo-dataset-deduplication.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data/demo-dataset-deduplication.jsonl -------------------------------------------------------------------------------- /demos/data/demo-dataset-images-bytes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data/demo-dataset-images-bytes.parquet -------------------------------------------------------------------------------- /demos/data/demo-dataset-images.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data/demo-dataset-images.jsonl -------------------------------------------------------------------------------- /demos/data/demo-dataset-videos.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data/demo-dataset-videos.jsonl -------------------------------------------------------------------------------- /demos/data/demo-dataset.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data/demo-dataset.jsonl -------------------------------------------------------------------------------- /demos/data/demo-dataset_1725870268.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data/demo-dataset_1725870268.jsonl -------------------------------------------------------------------------------- /demos/data/demo-dataset_1725870628.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data/demo-dataset_1725870628.jsonl -------------------------------------------------------------------------------- /demos/data_mixture/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_mixture/app.py -------------------------------------------------------------------------------- /demos/data_mixture/data/redpajama-c4-refined.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_mixture/data/redpajama-c4-refined.jsonl -------------------------------------------------------------------------------- /demos/data_mixture/data/the-pile-nih-refined.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_mixture/data/the-pile-nih-refined.jsonl -------------------------------------------------------------------------------- /demos/data_mixture/data/the-pile-uspto-refined.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_mixture/data/the-pile-uspto-refined.jsonl -------------------------------------------------------------------------------- /demos/data_process_hpo/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_process_hpo/app.py -------------------------------------------------------------------------------- /demos/data_process_hpo/imgs/data-juicer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_process_hpo/imgs/data-juicer.png -------------------------------------------------------------------------------- /demos/data_process_loop/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_process_loop/app.py -------------------------------------------------------------------------------- /demos/data_process_loop/configs/demo.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_process_loop/configs/demo.yaml -------------------------------------------------------------------------------- /demos/data_process_loop/data/demo-dataset.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_process_loop/data/demo-dataset.jsonl -------------------------------------------------------------------------------- /demos/data_visualization_diversity/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_visualization_diversity/app.py -------------------------------------------------------------------------------- /demos/data_visualization_diversity/configs/demo.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_visualization_diversity/configs/demo.yaml -------------------------------------------------------------------------------- /demos/data_visualization_op_effect/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_visualization_op_effect/app.py -------------------------------------------------------------------------------- /demos/data_visualization_op_insight/app.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_visualization_op_insight/app.css -------------------------------------------------------------------------------- /demos/data_visualization_op_insight/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_visualization_op_insight/app.py -------------------------------------------------------------------------------- /demos/data_visualization_op_insight/cache/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/data_visualization_statistics/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/data_visualization_statistics/app.py -------------------------------------------------------------------------------- /demos/overview_scan/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/overview_scan/app.py -------------------------------------------------------------------------------- /demos/overview_scan/data/demo-dataset.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/overview_scan/data/demo-dataset.jsonl -------------------------------------------------------------------------------- /demos/process_cft_zh_data/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_cft_zh_data/app.py -------------------------------------------------------------------------------- /demos/process_cft_zh_data/data/alpaca-cot.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_cft_zh_data/data/alpaca-cot.jsonl -------------------------------------------------------------------------------- /demos/process_code_data/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_code_data/app.py -------------------------------------------------------------------------------- /demos/process_code_data/data/stack_exchange.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_code_data/data/stack_exchange.jsonl -------------------------------------------------------------------------------- /demos/process_on_ray/configs/dedup.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_on_ray/configs/dedup.yaml -------------------------------------------------------------------------------- /demos/process_on_ray/configs/demo-new-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_on_ray/configs/demo-new-config.yaml -------------------------------------------------------------------------------- /demos/process_on_ray/configs/demo.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_on_ray/configs/demo.yaml -------------------------------------------------------------------------------- /demos/process_on_ray/data/demo-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_on_ray/data/demo-dataset.json -------------------------------------------------------------------------------- /demos/process_on_ray/data/demo-dataset.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_on_ray/data/demo-dataset.jsonl -------------------------------------------------------------------------------- /demos/process_sci_data/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_sci_data/app.py -------------------------------------------------------------------------------- /demos/process_sci_data/data/arxiv.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_sci_data/data/arxiv.jsonl -------------------------------------------------------------------------------- /demos/process_simple/process.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_simple/process.yaml -------------------------------------------------------------------------------- /demos/process_video_on_ray/configs/demo.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_video_on_ray/configs/demo.yaml -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/Note.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_video_on_ray/data/Note.md -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/demo-dataset.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_video_on_ray/data/demo-dataset.jsonl -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/videos/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_video_on_ray/data/videos/video1.mp4 -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/videos/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_video_on_ray/data/videos/video2.mp4 -------------------------------------------------------------------------------- /demos/process_video_on_ray/data/videos/video3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/process_video_on_ray/data/videos/video3.mp4 -------------------------------------------------------------------------------- /demos/role_playing_system_prompt/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/role_playing_system_prompt/README_ZH.md -------------------------------------------------------------------------------- /demos/tool_dataset_splitting_by_language/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/tool_dataset_splitting_by_language/app.py -------------------------------------------------------------------------------- /demos/tool_quality_classifier/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/demos/tool_quality_classifier/app.py -------------------------------------------------------------------------------- /demos/tool_quality_classifier/quality_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/BadDataExhibition.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/BadDataExhibition.md -------------------------------------------------------------------------------- /docs/BadDataExhibition_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/BadDataExhibition_ZH.md -------------------------------------------------------------------------------- /docs/DJ_SORA.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DJ_SORA.md -------------------------------------------------------------------------------- /docs/DJ_SORA_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DJ_SORA_ZH.md -------------------------------------------------------------------------------- /docs/DJ_agents.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DJ_agents.md -------------------------------------------------------------------------------- /docs/DJ_agents_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DJ_agents_ZH.md -------------------------------------------------------------------------------- /docs/DJ_service.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DJ_service.md -------------------------------------------------------------------------------- /docs/DJ_service_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DJ_service_ZH.md -------------------------------------------------------------------------------- /docs/DatasetCfg.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DatasetCfg.md -------------------------------------------------------------------------------- /docs/DatasetCfg_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DatasetCfg_ZH.md -------------------------------------------------------------------------------- /docs/DeveloperGuide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DeveloperGuide.md -------------------------------------------------------------------------------- /docs/DeveloperGuide_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/DeveloperGuide_ZH.md -------------------------------------------------------------------------------- /docs/Distributed.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/Distributed.md -------------------------------------------------------------------------------- /docs/Distributed_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/Distributed_ZH.md -------------------------------------------------------------------------------- /docs/Operators.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/Operators.md -------------------------------------------------------------------------------- /docs/Sandbox.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/Sandbox.md -------------------------------------------------------------------------------- /docs/Sandbox_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/Sandbox_ZH.md -------------------------------------------------------------------------------- /docs/awesome_llm_data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/awesome_llm_data.md -------------------------------------------------------------------------------- /docs/hub/AlpacaCOT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/hub/AlpacaCOT.md -------------------------------------------------------------------------------- /docs/hub/AlpacaCOT_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/hub/AlpacaCOT_ZH.md -------------------------------------------------------------------------------- /docs/hub/AnnotationNotificationSystem.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/hub/AnnotationNotificationSystem.md -------------------------------------------------------------------------------- /docs/hub/BLOOM.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/hub/BLOOM.md -------------------------------------------------------------------------------- /docs/hub/BLOOM_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/hub/BLOOM_ZH.md -------------------------------------------------------------------------------- /docs/hub/RecipeGallery.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/hub/RecipeGallery.md -------------------------------------------------------------------------------- /docs/hub/RecipeGallery_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/hub/RecipeGallery_ZH.md -------------------------------------------------------------------------------- /docs/hub/RedPajama.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/hub/RedPajama.md -------------------------------------------------------------------------------- /docs/hub/RedPajama_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/hub/RedPajama_ZH.md -------------------------------------------------------------------------------- /docs/imgs/data-juicer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/imgs/data-juicer.jpg -------------------------------------------------------------------------------- /docs/imgs/dj_agent_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/imgs/dj_agent_image.png -------------------------------------------------------------------------------- /docs/imgs/dj_dev_agent_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/imgs/dj_dev_agent_image.png -------------------------------------------------------------------------------- /docs/imgs/eval-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/imgs/eval-01.png -------------------------------------------------------------------------------- /docs/imgs/eval-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/imgs/eval-02.png -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/examples.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/examples.json -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/generate_op_details.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/generate_op_details.py -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/runner.py -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/templates/op_doc.md.j2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/templates/op_doc.md.j2 -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/utils/example_ir.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/utils/example_ir.py -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/utils/extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/utils/extractor.py -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/utils/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/utils/model.py -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/utils/parse_class.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/utils/parse_class.py -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/utils/router.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/utils/router.py -------------------------------------------------------------------------------- /docs/op_doc_enhance_workflow/utils/view_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/op_doc_enhance_workflow/utils/view_model.py -------------------------------------------------------------------------------- /docs/operators/aggregator/meta_tags_aggregator.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/aggregator/meta_tags_aggregator.md -------------------------------------------------------------------------------- /docs/operators/aggregator/nested_aggregator.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/aggregator/nested_aggregator.md -------------------------------------------------------------------------------- /docs/operators/deduplicator/document_deduplicator.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/deduplicator/document_deduplicator.md -------------------------------------------------------------------------------- /docs/operators/deduplicator/image_deduplicator.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/deduplicator/image_deduplicator.md -------------------------------------------------------------------------------- /docs/operators/deduplicator/video_deduplicator.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/deduplicator/video_deduplicator.md -------------------------------------------------------------------------------- /docs/operators/filter/alphanumeric_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/alphanumeric_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/audio_duration_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/audio_duration_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/audio_nmf_snr_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/audio_nmf_snr_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/audio_size_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/audio_size_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/average_line_length_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/average_line_length_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/character_repetition_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/character_repetition_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/flagged_words_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/flagged_words_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/general_field_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/general_field_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/image_aesthetics_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/image_aesthetics_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/image_aspect_ratio_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/image_aspect_ratio_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/image_face_count_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/image_face_count_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/image_face_ratio_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/image_face_ratio_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/image_nsfw_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/image_nsfw_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/image_shape_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/image_shape_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/image_size_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/image_size_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/image_text_matching_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/image_text_matching_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/image_watermark_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/image_watermark_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/in_context_influence_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/in_context_influence_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/language_id_score_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/language_id_score_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/llm_analysis_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/llm_analysis_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/llm_difficulty_score_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/llm_difficulty_score_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/llm_perplexity_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/llm_perplexity_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/llm_quality_score_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/llm_quality_score_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/llm_task_relevance_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/llm_task_relevance_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/maximum_line_length_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/maximum_line_length_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/perplexity_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/perplexity_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/special_characters_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/special_characters_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/specified_field_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/specified_field_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/stopwords_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/stopwords_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/suffix_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/suffix_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/text_action_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/text_action_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/text_embd_similarity_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/text_embd_similarity_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/text_length_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/text_length_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/text_pair_similarity_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/text_pair_similarity_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/token_num_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/token_num_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/video_aesthetics_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/video_aesthetics_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/video_aspect_ratio_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/video_aspect_ratio_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/video_duration_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/video_duration_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/video_motion_score_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/video_motion_score_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/video_nsfw_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/video_nsfw_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/video_ocr_area_ratio_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/video_ocr_area_ratio_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/video_resolution_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/video_resolution_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/video_watermark_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/video_watermark_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/word_repetition_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/word_repetition_filter.md -------------------------------------------------------------------------------- /docs/operators/filter/words_num_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/filter/words_num_filter.md -------------------------------------------------------------------------------- /docs/operators/formatter/csv_formatter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/formatter/csv_formatter.md -------------------------------------------------------------------------------- /docs/operators/formatter/empty_formatter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/formatter/empty_formatter.md -------------------------------------------------------------------------------- /docs/operators/formatter/json_formatter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/formatter/json_formatter.md -------------------------------------------------------------------------------- /docs/operators/formatter/parquet_formatter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/formatter/parquet_formatter.md -------------------------------------------------------------------------------- /docs/operators/formatter/ray_empty_formatter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/formatter/ray_empty_formatter.md -------------------------------------------------------------------------------- /docs/operators/formatter/text_formatter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/formatter/text_formatter.md -------------------------------------------------------------------------------- /docs/operators/formatter/tsv_formatter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/formatter/tsv_formatter.md -------------------------------------------------------------------------------- /docs/operators/grouper/key_value_grouper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/grouper/key_value_grouper.md -------------------------------------------------------------------------------- /docs/operators/grouper/naive_grouper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/grouper/naive_grouper.md -------------------------------------------------------------------------------- /docs/operators/grouper/naive_reverse_grouper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/grouper/naive_reverse_grouper.md -------------------------------------------------------------------------------- /docs/operators/mapper/audio_ffmpeg_wrapped_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/audio_ffmpeg_wrapped_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/calibrate_qa_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/calibrate_qa_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/calibrate_query_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/calibrate_query_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/calibrate_response_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/calibrate_response_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/chinese_convert_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/chinese_convert_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/clean_copyright_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/clean_copyright_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/clean_email_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/clean_email_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/clean_html_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/clean_html_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/clean_ip_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/clean_ip_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/clean_links_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/clean_links_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/download_file_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/download_file_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/expand_macro_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/expand_macro_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/extract_event_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/extract_event_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/extract_keyword_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/extract_keyword_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/extract_nickname_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/extract_nickname_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/extract_support_text_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/extract_support_text_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/fix_unicode_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/fix_unicode_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/image_blur_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/image_blur_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/image_captioning_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/image_captioning_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/image_detection_yolo_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/image_detection_yolo_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/image_diffusion_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/image_diffusion_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/image_face_blur_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/image_face_blur_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/image_segment_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/image_segment_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/image_tagging_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/image_tagging_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/mllm_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/mllm_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/nlpaug_en_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/nlpaug_en_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/nlpcda_zh_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/nlpcda_zh_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/optimize_prompt_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/optimize_prompt_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/optimize_qa_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/optimize_qa_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/optimize_query_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/optimize_query_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/optimize_response_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/optimize_response_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/pair_preference_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/pair_preference_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/python_file_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/python_file_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/python_lambda_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/python_lambda_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/relation_identity_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/relation_identity_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/remove_bibliography_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/remove_bibliography_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/remove_comments_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/remove_comments_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/remove_header_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/remove_header_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/remove_long_words_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/remove_long_words_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/remove_table_text_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/remove_table_text_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/replace_content_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/replace_content_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/sdxl_prompt2prompt_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/sdxl_prompt2prompt_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/sentence_split_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/sentence_split_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/text_chunk_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/text_chunk_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/vggt_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/vggt_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/video_extract_frames_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/video_extract_frames_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/video_face_blur_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/video_face_blur_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/video_ffmpeg_wrapped_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/video_ffmpeg_wrapped_mapper.md -------------------------------------------------------------------------------- /docs/operators/mapper/video_split_by_scene_mapper.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/mapper/video_split_by_scene_mapper.md -------------------------------------------------------------------------------- /docs/operators/op/general_fused_op.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/op/general_fused_op.md -------------------------------------------------------------------------------- /docs/operators/selector/random_selector.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/operators/selector/random_selector.md -------------------------------------------------------------------------------- /docs/sphinx_doc/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/Makefile -------------------------------------------------------------------------------- /docs/sphinx_doc/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/README.md -------------------------------------------------------------------------------- /docs/sphinx_doc/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/README_ZH.md -------------------------------------------------------------------------------- /docs/sphinx_doc/_templates/package.rst_t: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/_templates/package.rst_t -------------------------------------------------------------------------------- /docs/sphinx_doc/build_doc.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/build_doc.sh -------------------------------------------------------------------------------- /docs/sphinx_doc/build_versions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/build_versions.py -------------------------------------------------------------------------------- /docs/sphinx_doc/make.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/make.bat -------------------------------------------------------------------------------- /docs/sphinx_doc/redirect.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/redirect.html -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_static/awesome-list.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/_static/awesome-list.html -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_static/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/_static/images/icon.png -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/_static/images/logo.png -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_static/sidebar-menu.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/_static/sidebar-menu.css -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_static/tutorial_kdd24.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/_static/tutorial_kdd24.html -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_templates/page.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/_templates/page.html -------------------------------------------------------------------------------- /docs/sphinx_doc/source/_templates/sidebar/brand.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/_templates/sidebar/brand.html -------------------------------------------------------------------------------- /docs/sphinx_doc/source/api.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/api.rst -------------------------------------------------------------------------------- /docs/sphinx_doc/source/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/conf.py -------------------------------------------------------------------------------- /docs/sphinx_doc/source/custom_myst.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/custom_myst.py -------------------------------------------------------------------------------- /docs/sphinx_doc/source/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/index.rst -------------------------------------------------------------------------------- /docs/sphinx_doc/source/index_ZH.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/sphinx_doc/source/index_ZH.rst -------------------------------------------------------------------------------- /docs/tutorial/DJ-Cookbook.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/tutorial/DJ-Cookbook.md -------------------------------------------------------------------------------- /docs/tutorial/DJ-Cookbook_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/tutorial/DJ-Cookbook_ZH.md -------------------------------------------------------------------------------- /docs/tutorial/Installation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/tutorial/Installation.md -------------------------------------------------------------------------------- /docs/tutorial/Installation_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/tutorial/Installation_ZH.md -------------------------------------------------------------------------------- /docs/tutorial/QuickStart.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/tutorial/QuickStart.md -------------------------------------------------------------------------------- /docs/tutorial/QuickStart_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/docs/tutorial/QuickStart_ZH.md -------------------------------------------------------------------------------- /label_studio_localhost_connection.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/label_studio_localhost_connection.json -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/pyproject.toml -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/scripts/README.md -------------------------------------------------------------------------------- /scripts/dlc/partition_data_dlc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/scripts/dlc/partition_data_dlc.py -------------------------------------------------------------------------------- /scripts/dlc/run_on_dlc.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/scripts/dlc/run_on_dlc.sh -------------------------------------------------------------------------------- /scripts/run_slurm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/scripts/run_slurm.sh -------------------------------------------------------------------------------- /service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/service.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/analysis/test_collector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/analysis/test_collector.py -------------------------------------------------------------------------------- /tests/analysis/test_column_wise_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/analysis/test_column_wise_analysis.py -------------------------------------------------------------------------------- /tests/analysis/test_correlation_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/analysis/test_correlation_analysis.py -------------------------------------------------------------------------------- /tests/analysis/test_diversity_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/analysis/test_diversity_analysis.py -------------------------------------------------------------------------------- /tests/analysis/test_measure.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/analysis/test_measure.py -------------------------------------------------------------------------------- /tests/analysis/test_overall_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/analysis/test_overall_analysis.py -------------------------------------------------------------------------------- /tests/benchmark_performance/configs/audio.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/benchmark_performance/configs/audio.yaml -------------------------------------------------------------------------------- /tests/benchmark_performance/configs/image.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/benchmark_performance/configs/image.yaml -------------------------------------------------------------------------------- /tests/benchmark_performance/configs/text.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/benchmark_performance/configs/text.yaml -------------------------------------------------------------------------------- /tests/benchmark_performance/configs/video.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/benchmark_performance/configs/video.yaml -------------------------------------------------------------------------------- /tests/benchmark_performance/report.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/benchmark_performance/report.py -------------------------------------------------------------------------------- /tests/benchmark_performance/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/benchmark_performance/run.sh -------------------------------------------------------------------------------- /tests/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/config/demo_4_test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/config/demo_4_test.yaml -------------------------------------------------------------------------------- /tests/config/demo_4_test_bad_val.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/config/demo_4_test_bad_val.yaml -------------------------------------------------------------------------------- /tests/config/demo_4_test_multiple_text_keys.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/config/demo_4_test_multiple_text_keys.yaml -------------------------------------------------------------------------------- /tests/config/demo_4_test_same_ops.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/config/demo_4_test_same_ops.yaml -------------------------------------------------------------------------------- /tests/config/test_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/config/test_config.py -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/core/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/core/data/test_config_validator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_config_validator.py -------------------------------------------------------------------------------- /tests/core/data/test_data/parquet/sample.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_data/parquet/sample.parquet -------------------------------------------------------------------------------- /tests/core/data/test_data/sample.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_data/sample.jsonl -------------------------------------------------------------------------------- /tests/core/data/test_data/sample.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_data/sample.txt -------------------------------------------------------------------------------- /tests/core/data/test_data/test_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_data/test_config.yaml -------------------------------------------------------------------------------- /tests/core/data/test_data/test_config_list.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_data/test_config_list.yaml -------------------------------------------------------------------------------- /tests/core/data/test_data/test_config_ray.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_data/test_config_ray.yaml -------------------------------------------------------------------------------- /tests/core/data/test_data_validator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_data_validator.py -------------------------------------------------------------------------------- /tests/core/data/test_dataset_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_dataset_builder.py -------------------------------------------------------------------------------- /tests/core/data/test_dj_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_dj_dataset.py -------------------------------------------------------------------------------- /tests/core/data/test_load_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_load_strategy.py -------------------------------------------------------------------------------- /tests/core/data/test_ray_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_ray_dataset.py -------------------------------------------------------------------------------- /tests/core/data/test_schema.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/data/test_schema.py -------------------------------------------------------------------------------- /tests/core/executor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/core/executor/test_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/executor/test_base.py -------------------------------------------------------------------------------- /tests/core/executor/test_default_executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/executor/test_default_executor.py -------------------------------------------------------------------------------- /tests/core/executor/test_factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/executor/test_factory.py -------------------------------------------------------------------------------- /tests/core/executor/test_ray_executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/executor/test_ray_executor.py -------------------------------------------------------------------------------- /tests/core/test_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/test_adapter.py -------------------------------------------------------------------------------- /tests/core/test_analyzer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/test_analyzer.py -------------------------------------------------------------------------------- /tests/core/test_exporter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/test_exporter.py -------------------------------------------------------------------------------- /tests/core/test_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/test_monitor.py -------------------------------------------------------------------------------- /tests/core/test_ray_exporter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/test_ray_exporter.py -------------------------------------------------------------------------------- /tests/core/test_tracer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/core/test_tracer.py -------------------------------------------------------------------------------- /tests/download/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/download/test_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/download/test_download.py -------------------------------------------------------------------------------- /tests/format/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/format/data/structured/demo-dataset.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/data/structured/demo-dataset.csv -------------------------------------------------------------------------------- /tests/format/data/structured/demo-dataset.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/data/structured/demo-dataset.jsonl -------------------------------------------------------------------------------- /tests/format/data/structured/demo-dataset.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/data/structured/demo-dataset.parquet -------------------------------------------------------------------------------- /tests/format/data/structured/demo-dataset.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/data/structured/demo-dataset.tsv -------------------------------------------------------------------------------- /tests/format/data/text/sample1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/data/text/sample1.txt -------------------------------------------------------------------------------- /tests/format/data/text/sample2.txt: -------------------------------------------------------------------------------- 1 | Do you need a cup of coffee? 2 | -------------------------------------------------------------------------------- /tests/format/data/text/sample3.txt: -------------------------------------------------------------------------------- 1 | 你好,请问你是谁 2 | -------------------------------------------------------------------------------- /tests/format/data/text/sample4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/data/text/sample4.txt -------------------------------------------------------------------------------- /tests/format/data/text/sample5.txt: -------------------------------------------------------------------------------- 1 | 欢迎来到阿里巴巴! 2 | -------------------------------------------------------------------------------- /tests/format/data/text/sample6.txt: -------------------------------------------------------------------------------- 1 | This paper proposed a novel method on LLM pretraining. 2 | -------------------------------------------------------------------------------- /tests/format/test_csv_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/test_csv_formatter.py -------------------------------------------------------------------------------- /tests/format/test_empty_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/test_empty_formatter.py -------------------------------------------------------------------------------- /tests/format/test_json_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/test_json_formatter.py -------------------------------------------------------------------------------- /tests/format/test_load_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/test_load_formatter.py -------------------------------------------------------------------------------- /tests/format/test_parquet_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/test_parquet_formatter.py -------------------------------------------------------------------------------- /tests/format/test_tsv_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/test_tsv_formatter.py -------------------------------------------------------------------------------- /tests/format/test_unify_format.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/format/test_unify_format.py -------------------------------------------------------------------------------- /tests/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/aggregator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/aggregator/test_meta_tags_aggregator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/aggregator/test_meta_tags_aggregator.py -------------------------------------------------------------------------------- /tests/ops/aggregator/test_nested_aggregator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/aggregator/test_nested_aggregator.py -------------------------------------------------------------------------------- /tests/ops/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/data/audio1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/audio1.wav -------------------------------------------------------------------------------- /tests/ops/data/audio2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/audio2.wav -------------------------------------------------------------------------------- /tests/ops/data/audio3.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/audio3.ogg -------------------------------------------------------------------------------- /tests/ops/data/blip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/blip.jpg -------------------------------------------------------------------------------- /tests/ops/data/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/cat.jpg -------------------------------------------------------------------------------- /tests/ops/data/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img1.png -------------------------------------------------------------------------------- /tests/ops/data/img10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img10.jpg -------------------------------------------------------------------------------- /tests/ops/data/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img2.jpg -------------------------------------------------------------------------------- /tests/ops/data/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img3.jpg -------------------------------------------------------------------------------- /tests/ops/data/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img4.png -------------------------------------------------------------------------------- /tests/ops/data/img5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img5.jpg -------------------------------------------------------------------------------- /tests/ops/data/img6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img6.jpg -------------------------------------------------------------------------------- /tests/ops/data/img7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img7.jpg -------------------------------------------------------------------------------- /tests/ops/data/img8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img8.jpg -------------------------------------------------------------------------------- /tests/ops/data/img9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img9.jpg -------------------------------------------------------------------------------- /tests/ops/data/img_pair_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img_pair_1.jpg -------------------------------------------------------------------------------- /tests/ops/data/img_pair_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/img_pair_2.jpg -------------------------------------------------------------------------------- /tests/ops/data/lena-face.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/lena-face.jpg -------------------------------------------------------------------------------- /tests/ops/data/lena.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/lena.jpg -------------------------------------------------------------------------------- /tests/ops/data/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video1.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video10.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video10.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video11.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video11.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video2.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video3-no-audio.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video3-no-audio.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video3.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video4.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video4.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video5.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video5.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video6.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video6.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video7.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video7.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video8.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video8.mp4 -------------------------------------------------------------------------------- /tests/ops/data/video9.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/data/video9.mp4 -------------------------------------------------------------------------------- /tests/ops/deduplicator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/deduplicator/test_document_deduplicator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/deduplicator/test_document_deduplicator.py -------------------------------------------------------------------------------- /tests/ops/deduplicator/test_image_deduplicator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/deduplicator/test_image_deduplicator.py -------------------------------------------------------------------------------- /tests/ops/deduplicator/test_video_deduplicator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/deduplicator/test_video_deduplicator.py -------------------------------------------------------------------------------- /tests/ops/filter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/filter/test_alphanumeric_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_alphanumeric_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_audio_duration_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_audio_duration_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_audio_nmf_snr_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_audio_nmf_snr_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_audio_size_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_audio_size_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_average_line_length_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_average_line_length_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_character_repetition_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_character_repetition_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_flagged_words_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_flagged_words_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_general_field_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_general_field_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_image_aesthetics_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_image_aesthetics_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_image_aspect_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_image_aspect_ratio_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_image_face_count_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_image_face_count_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_image_face_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_image_face_ratio_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_image_nsfw_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_image_nsfw_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_image_shape_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_image_shape_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_image_size_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_image_size_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_image_text_matching_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_image_text_matching_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_image_watermark_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_image_watermark_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_in_context_influence_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_in_context_influence_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_language_id_score_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_language_id_score_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_llm_analysis_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_llm_analysis_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_llm_difficulty_score_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_llm_difficulty_score_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_llm_perplexity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_llm_perplexity_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_llm_quality_score_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_llm_quality_score_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_llm_task_relevance_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_llm_task_relevance_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_maximum_line_length_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_maximum_line_length_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_perplexity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_perplexity_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_special_characters_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_special_characters_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_specified_field_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_specified_field_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_stopwords_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_stopwords_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_suffix_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_suffix_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_text_action_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_text_action_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_text_embd_similarity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_text_embd_similarity_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_text_length_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_text_length_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_text_pair_similarity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_text_pair_similarity_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_token_num_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_token_num_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_video_aesthetics_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_video_aesthetics_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_video_aspect_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_video_aspect_ratio_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_video_duration_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_video_duration_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_video_motion_score_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_video_motion_score_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_video_nsfw_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_video_nsfw_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_video_ocr_area_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_video_ocr_area_ratio_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_video_resolution_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_video_resolution_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_video_watermark_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_video_watermark_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_word_repetition_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_word_repetition_filter.py -------------------------------------------------------------------------------- /tests/ops/filter/test_words_num_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/filter/test_words_num_filter.py -------------------------------------------------------------------------------- /tests/ops/grouper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/grouper/test_key_value_grouper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/grouper/test_key_value_grouper.py -------------------------------------------------------------------------------- /tests/ops/grouper/test_naive_grouper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/grouper/test_naive_grouper.py -------------------------------------------------------------------------------- /tests/ops/grouper/test_naive_reverse_grouper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/grouper/test_naive_reverse_grouper.py -------------------------------------------------------------------------------- /tests/ops/mapper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/mapper/annotation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/mapper/test_audio_ffmpeg_wrapped_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_audio_ffmpeg_wrapped_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_calibrate_qa_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_calibrate_qa_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_calibrate_query_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_calibrate_query_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_calibrate_response_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_calibrate_response_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_chinese_convert_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_chinese_convert_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_clean_copyright_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_clean_copyright_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_clean_email_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_clean_email_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_clean_html_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_clean_html_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_clean_ip_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_clean_ip_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_clean_links_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_clean_links_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_download_file_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_download_file_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_expand_macro_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_expand_macro_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_extract_event_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_extract_event_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_extract_keyword_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_extract_keyword_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_extract_nickname_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_extract_nickname_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_extract_support_text_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_extract_support_text_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_fix_unicode_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_fix_unicode_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_image_blur_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_image_blur_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_image_captioning_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_image_captioning_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_image_detection_yolo_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_image_detection_yolo_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_image_diffusion_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_image_diffusion_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_image_face_blur_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_image_face_blur_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_image_segment_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_image_segment_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_image_tagging_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_image_tagging_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_mllm_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_mllm_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_nlpaug_en_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_nlpaug_en_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_nlpcda_zh_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_nlpcda_zh_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_optimize_prompt_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_optimize_prompt_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_optimize_qa_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_optimize_qa_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_optimize_query_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_optimize_query_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_optimize_response_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_optimize_response_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_pair_preference_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_pair_preference_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_python_file_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_python_file_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_python_lambda_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_python_lambda_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_relation_identity_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_relation_identity_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_remove_bibliography_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_remove_bibliography_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_remove_comments_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_remove_comments_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_remove_header_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_remove_header_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_remove_long_words_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_remove_long_words_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_remove_table_text_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_remove_table_text_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_replace_content_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_replace_content_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_sdxl_prompt2prompt_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_sdxl_prompt2prompt_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_sentence_split_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_sentence_split_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_text_chunk_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_text_chunk_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_vggt_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_vggt_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_video_extract_frames_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_video_extract_frames_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_video_face_blur_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_video_face_blur_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_video_ffmpeg_wrapped_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_video_ffmpeg_wrapped_mapper.py -------------------------------------------------------------------------------- /tests/ops/mapper/test_video_split_by_scene_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/mapper/test_video_split_by_scene_mapper.py -------------------------------------------------------------------------------- /tests/ops/selector/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/selector/test_random_selector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/selector/test_random_selector.py -------------------------------------------------------------------------------- /tests/ops/test_base_op.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/test_base_op.py -------------------------------------------------------------------------------- /tests/ops/test_op_fusion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/ops/test_op_fusion.py -------------------------------------------------------------------------------- /tests/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/run.py -------------------------------------------------------------------------------- /tests/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tools/test_mcp_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/tools/test_mcp_server.py -------------------------------------------------------------------------------- /tests/tools/test_process_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/tools/test_process_data.py -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/utils/test_asset_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_asset_utils.py -------------------------------------------------------------------------------- /tests/utils/test_availablility_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_availablility_utils.py -------------------------------------------------------------------------------- /tests/utils/test_cache_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_cache_utils.py -------------------------------------------------------------------------------- /tests/utils/test_ckpt_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_ckpt_utils.py -------------------------------------------------------------------------------- /tests/utils/test_common_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_common_utils.py -------------------------------------------------------------------------------- /tests/utils/test_compress.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_compress.py -------------------------------------------------------------------------------- /tests/utils/test_constant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_constant.py -------------------------------------------------------------------------------- /tests/utils/test_file_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_file_utils.py -------------------------------------------------------------------------------- /tests/utils/test_fingerprint_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_fingerprint_utils.py -------------------------------------------------------------------------------- /tests/utils/test_lazy_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_lazy_loader.py -------------------------------------------------------------------------------- /tests/utils/test_logger_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_logger_utils.py -------------------------------------------------------------------------------- /tests/utils/test_mm_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_mm_utils.py -------------------------------------------------------------------------------- /tests/utils/test_model_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_model_utils.py -------------------------------------------------------------------------------- /tests/utils/test_process_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_process_utils.py -------------------------------------------------------------------------------- /tests/utils/test_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_registry.py -------------------------------------------------------------------------------- /tests/utils/test_resource_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_resource_utils.py -------------------------------------------------------------------------------- /tests/utils/test_s3_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_s3_utils.py -------------------------------------------------------------------------------- /tests/utils/test_unittest_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_unittest_utils.py -------------------------------------------------------------------------------- /tests/utils/test_video_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tests/utils/test_video_utils.py -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/LLM_ecosystems/README.md -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/LLM_ecosystems/README_ZH.md -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/patch/helm.diff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/LLM_ecosystems/patch/helm.diff -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/patch/megatron.diff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/LLM_ecosystems/patch/megatron.diff -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/setup_helm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/LLM_ecosystems/setup_helm.sh -------------------------------------------------------------------------------- /thirdparty/LLM_ecosystems/setup_megatron.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/LLM_ecosystems/setup_megatron.sh -------------------------------------------------------------------------------- /thirdparty/models/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/models/README.md -------------------------------------------------------------------------------- /thirdparty/models/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/models/README_ZH.md -------------------------------------------------------------------------------- /thirdparty/models/patch/easyanimate.diff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/models/patch/easyanimate.diff -------------------------------------------------------------------------------- /thirdparty/models/setup_easyanimate.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/thirdparty/models/setup_easyanimate.sh -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/analyze_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/analyze_data.py -------------------------------------------------------------------------------- /tools/check_ray_cluster.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/check_ray_cluster.py -------------------------------------------------------------------------------- /tools/check_s3_integration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/check_s3_integration.py -------------------------------------------------------------------------------- /tools/converter/batch_convert.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/converter/batch_convert.sh -------------------------------------------------------------------------------- /tools/converter/convert_gpt_to_transformers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/converter/convert_gpt_to_transformers.py -------------------------------------------------------------------------------- /tools/converter/modeling_megatron_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/converter/modeling_megatron_llama.py -------------------------------------------------------------------------------- /tools/data_resplit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/data_resplit.py -------------------------------------------------------------------------------- /tools/distributed_deduplication/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/distributed_deduplication/README.md -------------------------------------------------------------------------------- /tools/distributed_deduplication/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/distributed_deduplication/README_ZH.md -------------------------------------------------------------------------------- /tools/distributed_deduplication/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/distributed_deduplication/dedup_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/distributed_deduplication/dedup_utils.py -------------------------------------------------------------------------------- /tools/distributed_deduplication/spark_dedup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/distributed_deduplication/spark_dedup.py -------------------------------------------------------------------------------- /tools/dj_install.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/dj_install.py -------------------------------------------------------------------------------- /tools/evaluator/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/README.md -------------------------------------------------------------------------------- /tools/evaluator/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/README_ZH.md -------------------------------------------------------------------------------- /tools/evaluator/config/evaluator_example.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/config/evaluator_example.yaml -------------------------------------------------------------------------------- /tools/evaluator/config/helm_spec_template.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/config/helm_spec_template.conf -------------------------------------------------------------------------------- /tools/evaluator/evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/evaluator.py -------------------------------------------------------------------------------- /tools/evaluator/gpt_eval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/gpt_eval/README.md -------------------------------------------------------------------------------- /tools/evaluator/gpt_eval/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/gpt_eval/README_ZH.md -------------------------------------------------------------------------------- /tools/evaluator/gpt_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/evaluator/gpt_eval/answer_generator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/gpt_eval/answer_generator.py -------------------------------------------------------------------------------- /tools/evaluator/gpt_eval/config/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/gpt_eval/config/config.yaml -------------------------------------------------------------------------------- /tools/evaluator/gpt_eval/config/prompt.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/gpt_eval/config/prompt.jsonl -------------------------------------------------------------------------------- /tools/evaluator/gpt_eval/config/question.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/gpt_eval/config/question.jsonl -------------------------------------------------------------------------------- /tools/evaluator/gpt_eval/config/reviewer.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/gpt_eval/config/reviewer.jsonl -------------------------------------------------------------------------------- /tools/evaluator/gpt_eval/gpt_evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/gpt_eval/gpt_evaluator.py -------------------------------------------------------------------------------- /tools/evaluator/recorder/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/recorder/README.md -------------------------------------------------------------------------------- /tools/evaluator/recorder/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/recorder/README_ZH.md -------------------------------------------------------------------------------- /tools/evaluator/recorder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/evaluator/recorder/config/llama_example.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/recorder/config/llama_example.yaml -------------------------------------------------------------------------------- /tools/evaluator/recorder/config/mymodel_example.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/recorder/config/mymodel_example.yaml -------------------------------------------------------------------------------- /tools/evaluator/recorder/wandb_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/evaluator/recorder/wandb_writer.py -------------------------------------------------------------------------------- /tools/fmt_conversion/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/fmt_conversion/README.md -------------------------------------------------------------------------------- /tools/fmt_conversion/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/fmt_conversion/README_ZH.md -------------------------------------------------------------------------------- /tools/fmt_conversion/multimodal/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/fmt_conversion/multimodal/README.md -------------------------------------------------------------------------------- /tools/fmt_conversion/multimodal/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/fmt_conversion/multimodal/README_ZH.md -------------------------------------------------------------------------------- /tools/fmt_conversion/multimodal/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/fmt_conversion/multimodal/utils.py -------------------------------------------------------------------------------- /tools/fmt_conversion/post_tuning_dialog/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/fmt_conversion/post_tuning_dialog/README.md -------------------------------------------------------------------------------- /tools/fmt_conversion/post_tuning_dialog/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/fmt_conversion/post_tuning_dialog/README_ZH.md -------------------------------------------------------------------------------- /tools/generate_smtp_cert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/generate_smtp_cert.py -------------------------------------------------------------------------------- /tools/generate_uv_lock.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/generate_uv_lock.py -------------------------------------------------------------------------------- /tools/humanops/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/humanops/README.md -------------------------------------------------------------------------------- /tools/humanops/enable_legacy_token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/humanops/enable_legacy_token.png -------------------------------------------------------------------------------- /tools/humanops/label_studio_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/humanops/label_studio_service.py -------------------------------------------------------------------------------- /tools/mm_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/mm_eval/inception_metrics/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/inception_metrics/README.md -------------------------------------------------------------------------------- /tools/mm_eval/inception_metrics/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/inception_metrics/README_ZH.md -------------------------------------------------------------------------------- /tools/mm_eval/inception_metrics/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/inception_metrics/dataset.py -------------------------------------------------------------------------------- /tools/mm_eval/inception_metrics/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/inception_metrics/distributed.py -------------------------------------------------------------------------------- /tools/mm_eval/inception_metrics/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/inception_metrics/util.py -------------------------------------------------------------------------------- /tools/mm_eval/vbench_metrics/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/vbench_metrics/README.md -------------------------------------------------------------------------------- /tools/mm_eval/vbench_metrics/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/vbench_metrics/README_ZH.md -------------------------------------------------------------------------------- /tools/mm_eval/vbench_metrics/VBench_full_info.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/vbench_metrics/VBench_full_info.json -------------------------------------------------------------------------------- /tools/mm_eval/vbench_metrics/VBench_mini_info.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/vbench_metrics/VBench_mini_info.json -------------------------------------------------------------------------------- /tools/mm_eval/vbench_metrics/evaluate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/mm_eval/vbench_metrics/evaluate.py -------------------------------------------------------------------------------- /tools/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/postprocess/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/postprocess/README.md -------------------------------------------------------------------------------- /tools/postprocess/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/postprocess/README_ZH.md -------------------------------------------------------------------------------- /tools/postprocess/count_token.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/postprocess/count_token.py -------------------------------------------------------------------------------- /tools/postprocess/data_mixture.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/postprocess/data_mixture.py -------------------------------------------------------------------------------- /tools/postprocess/deserialize_meta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/postprocess/deserialize_meta.py -------------------------------------------------------------------------------- /tools/preprocess/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/preprocess/README.md -------------------------------------------------------------------------------- /tools/preprocess/README_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/preprocess/README_ZH.md -------------------------------------------------------------------------------- /tools/preprocess/dataset_split_by_language.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/preprocess/dataset_split_by_language.py -------------------------------------------------------------------------------- /tools/preprocess/raw_alpaca_cot_merge_add_meta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/preprocess/raw_alpaca_cot_merge_add_meta.py -------------------------------------------------------------------------------- /tools/preprocess/raw_arxiv_to_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/preprocess/raw_arxiv_to_jsonl.py -------------------------------------------------------------------------------- /tools/preprocess/raw_stackexchange_to_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/preprocess/raw_stackexchange_to_jsonl.py -------------------------------------------------------------------------------- /tools/preprocess/reformat_csv_nan_value.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/preprocess/reformat_csv_nan_value.py -------------------------------------------------------------------------------- /tools/preprocess/reformat_jsonl_nan_value.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/preprocess/reformat_jsonl_nan_value.py -------------------------------------------------------------------------------- /tools/preprocess/serialize_meta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/preprocess/serialize_meta.py -------------------------------------------------------------------------------- /tools/process_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/tools/process_data.py -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datajuicer/data-juicer/HEAD/uv.lock --------------------------------------------------------------------------------