├── .github └── workflows │ └── documentation.yml ├── .gitignore ├── .gitmodules ├── Awesome_Data_Evaluation.md ├── LICENSE ├── README.md ├── README.zh-CN.md ├── __init__.py ├── configs ├── eval │ ├── all_scorers.yaml │ ├── gen_image.yaml │ ├── gen_text_scorers.yaml │ ├── image_eval.yaml │ ├── image_eval_example.yaml │ ├── image_text_eval.yaml │ ├── image_text_eval_example.yaml │ ├── text_scorer_example1.yaml │ ├── text_scorer_example2.yaml │ ├── video_scorer.yaml │ └── webvid.yaml └── process │ ├── image_deduplicate.yaml │ ├── image_filter.yaml │ ├── image_text_filter.yaml │ ├── text_process.yaml │ ├── text_process_example.yaml │ ├── text_process_reasoner_ansfilter.yaml │ ├── video_process.yaml │ └── video_text_process.yaml ├── dataflow ├── Agent for System │ └── __init__.py ├── Eval │ ├── Text │ │ ├── README.md │ │ ├── README.zh-CN.md │ │ ├── __init__.py │ │ ├── apicaller │ │ │ ├── alpagasus_scorer.py │ │ │ ├── perspective_scorer.py │ │ │ └── treeinstruct_scorer.py │ │ ├── diversity │ │ │ ├── task2vec │ │ │ │ ├── task2vec.py │ │ │ │ ├── task_similarity.py │ │ │ │ └── utils.py │ │ │ ├── task2vec_scorer.py │ │ │ └── vendi_scorer.py │ │ ├── gen │ │ │ ├── bart_scorer.py │ │ │ ├── bert_scorer.py │ │ │ ├── bleu_scorer.py │ │ │ ├── bleurt_scorer.py │ │ │ ├── bleuscorer │ │ │ │ └── bleu.py │ │ │ ├── chrf_scorer.py │ │ │ ├── cider_scorer.py │ │ │ ├── ciderscorer │ │ │ │ └── cider.py │ │ │ ├── embedding_average_scorer.py │ │ │ ├── greedy_matching_scorer.py │ │ │ ├── hlepor_scorer.py │ │ │ ├── meteor_scorer.py │ │ │ ├── meteorscorer │ │ │ │ └── meteor.py │ │ │ ├── rouge_scorer.py │ │ │ ├── rougescorer │ │ │ │ └── rouge.py │ │ │ ├── ter_scorer.py │ │ │ └── wsd_scorer.py │ │ ├── models │ │ │ ├── Kenlm │ │ │ │ └── model.py │ │ │ ├── Qurating │ │ │ │ ├── modeling │ │ │ │ │ └── modeling_flash_llama.py │ │ │ │ └── qurater_annotate.py │ │ │ ├── Superfiltering │ │ │ │ └── data_analysis.py │ │ │ ├── __init__.py │ │ │ ├── debertav3_scorer.py │ │ │ ├── deita_complexity_scorer.py │ │ │ 
├── deita_quality_scorer.py │ │ │ ├── fineweb_edu_scorer.py │ │ │ ├── instag_scorer.py │ │ │ ├── perplexity_scorer.py │ │ │ ├── presidio_scorer.py │ │ │ ├── qurating_scorer.py │ │ │ ├── rm_scorer.py │ │ │ ├── superfiltering_scorer.py │ │ │ ├── textbook_scorer.py │ │ │ └── unieval_scorer.py │ │ └── statistics │ │ │ ├── __init__.py │ │ │ ├── langkit_scorer.py │ │ │ ├── lexical_diversity_scorer.py │ │ │ └── ngram_scorer.py │ ├── __init__.py │ ├── image │ │ ├── README.md │ │ ├── README.zh-CN.md │ │ ├── __init__.py │ │ ├── clip_scorer.py │ │ ├── clip_t5 │ │ │ ├── __init__.py │ │ │ └── model │ │ │ │ ├── __init__.py │ │ │ │ ├── language_model │ │ │ │ └── clip_t5.py │ │ │ │ ├── multimodal_encoder │ │ │ │ ├── builder.py │ │ │ │ └── clip_encoder.py │ │ │ │ └── multimodal_projector │ │ │ │ └── builder.py │ │ ├── clip_t5_scorer.py │ │ ├── fid │ │ │ └── inception.py │ │ ├── fid_scorer.py │ │ ├── fleur_scorer.py │ │ ├── image_aspect_ratio_scorer.py │ │ ├── image_resolution_scorer.py │ │ ├── image_text_scorer.py │ │ ├── is_scorer.py │ │ ├── kid │ │ │ ├── inception.py │ │ │ ├── lenet.pth │ │ │ └── lenet.py │ │ ├── kid_scorer.py │ │ ├── longclip_scorer.py │ │ ├── pyiqa_scorer.py │ │ └── visual_dialog_scorer.py │ └── video │ │ ├── README.md │ │ ├── README.zh-CN.md │ │ ├── __init__.py │ │ ├── dover │ │ ├── __init__.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── basic_datasets.py │ │ │ └── dover_datasets.py │ │ └── models │ │ │ ├── __init__.py │ │ │ ├── backbone_get_attention.py │ │ │ ├── backbone_v0_1.py │ │ │ ├── conv_backbone.py │ │ │ ├── evaluator.py │ │ │ ├── head.py │ │ │ ├── swin_backbone.py │ │ │ └── xclip_backbone.py │ │ ├── dover_scorer.py │ │ ├── emscore │ │ ├── __init__.py │ │ └── utils.py │ │ ├── emscorer.py │ │ ├── fastvqa │ │ ├── __init__.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── basic_datasets.py │ │ │ └── fusion_datasets.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── backbone_get_attention.py │ │ │ ├── backbone_v0_1.py │ │ │ ├── conv_backbone.py │ │ 
│ ├── evaluator.py │ │ │ ├── head.py │ │ │ ├── swin_backbone.py │ │ │ └── xclip_backbone.py │ │ └── version.py │ │ ├── fastvqa_scorer.py │ │ ├── pacscore │ │ └── utils.py │ │ ├── pacscorer.py │ │ ├── video_aesthetic_scorer.py │ │ ├── video_motion_scorer.py │ │ └── video_resolution_scorer.py ├── __init__.py ├── config │ ├── __init__.py │ └── config.py ├── core │ ├── __init__.py │ ├── process │ │ ├── deduplicator.py │ │ ├── filter.py │ │ ├── reasoner.py │ │ └── refiner.py │ └── scorer.py ├── data │ ├── __init__.py │ ├── dataflow_dataset.py │ ├── image_dataset.py │ ├── pure_video_dataset.py │ ├── text_dataset.py │ └── video_caption_dataset.py ├── format │ ├── __init__.py │ ├── image_formatter.py │ ├── text_formatter.py │ └── video_formatter.py ├── generate │ └── __init__.py ├── process │ ├── __init__.py │ ├── image │ │ ├── README.md │ │ ├── README.zh-CN.md │ │ ├── __init__.py │ │ ├── deduplicators │ │ │ ├── __init__.py │ │ │ └── deduplicator.py │ │ └── filters │ │ │ ├── __init__.py │ │ │ ├── image_aspect_ratio_filter.py │ │ │ ├── image_resolution_filter.py │ │ │ ├── image_text_similarity_filter.py │ │ │ └── pyiqa_filter.py │ ├── text │ │ ├── README.md │ │ ├── README.zh-CN.md │ │ ├── __init__.py │ │ ├── deduplicators │ │ │ ├── __init__.py │ │ │ ├── ccnet_deduplicator.py │ │ │ ├── hash_deduplicator.py │ │ │ ├── minhash_deduplicator.py │ │ │ ├── ngramhash_deduplicator.py │ │ │ ├── sem_deduplicator.py │ │ │ └── simhash_deduplicator.py │ │ ├── filters │ │ │ ├── __init__.py │ │ │ ├── alpagasus_filter.py │ │ │ ├── blocklist │ │ │ │ └── en.txt │ │ │ ├── debertav3_filter.py │ │ │ ├── deita_complexity_filter.py │ │ │ ├── deita_quality_filter.py │ │ │ ├── finewebedu_filter.py │ │ │ ├── heuristics.py │ │ │ ├── instag_filter.py │ │ │ ├── langkit_filter.py │ │ │ ├── language_filter.py │ │ │ ├── lexical_diversity_filter.py │ │ │ ├── ngram_filter.py │ │ │ ├── perplexity_filter.py │ │ │ ├── perspective_filter.py │ │ │ ├── presidio_filter.py │ │ │ ├── qurating_filter.py │ │ │ ├── 
reward_model_filter.py │ │ │ ├── superfiltering_filter.py │ │ │ ├── textbook_filter.py │ │ │ ├── treeinstrct_filter.py │ │ │ └── unieval_filter.py │ │ ├── reasoners │ │ │ ├── __init__.py │ │ │ ├── answer_formatter_filter.py │ │ │ ├── answer_ground_truth_filter.py │ │ │ ├── answer_ngram_filter.py │ │ │ ├── answer_token_length_filter.py │ │ │ └── math_problem_filter.py │ │ └── refiners │ │ │ ├── __init__.py │ │ │ ├── html_url_remover_refiner.py │ │ │ ├── lowercase_refiner.py │ │ │ ├── ner_refiner.py │ │ │ ├── pii_anonymize_refiner.py │ │ │ ├── remove_contractions_refiner.py │ │ │ ├── remove_emoji_refiner.py │ │ │ ├── remove_emoticons_refiner.py │ │ │ ├── remove_extra_spaces_refiner.py │ │ │ ├── remove_number_refiner.py │ │ │ ├── remove_punctuation_refiner.py │ │ │ ├── remove_repetitions_punctuation_refiner.py │ │ │ ├── remove_stopwords_refiner.py │ │ │ ├── spelling_correction_refiner.py │ │ │ ├── stemming_lemmatization_refiner.py │ │ │ └── text_normalization_refiner.py │ └── video │ │ ├── README.md │ │ ├── README.zh-CN.md │ │ ├── __init__.py │ │ └── filters │ │ ├── __init__.py │ │ ├── dover_filter.py │ │ ├── emscore_filter.py │ │ ├── fastervqa_filter.py │ │ ├── fastvqa_filter.py │ │ ├── pacscore_filter.py │ │ ├── video_motion_filter.py │ │ └── video_resolution_filter.py ├── retrieve │ └── __init__.py ├── utils │ ├── __init__.py │ ├── api_utils.py │ ├── image_utils.py │ ├── json_utils.py │ ├── mm_utils.py │ ├── model_utils.py │ ├── registry.py │ ├── text_utils.py │ └── utils.py └── visualization │ └── __init__.py ├── demos ├── demos_result │ └── processed.jsonl ├── image_eval │ ├── gen_image.json │ ├── gen_image_eval_example.ipynb │ ├── gen_images │ │ ├── 1de8c101-7d18-45c9-90ad-6b27bc9b565f.png │ │ ├── 1e558d58-f53e-422e-9715-19d4c12c093d.png │ │ ├── 202a9b83-28e5-4002-9858-64a4cb734f91.png │ │ ├── 65250ac7-c2ed-4ed1-b375-0916649c849b.png │ │ ├── 67d2b934-2be9-470b-898e-98251a81e74d.png │ │ ├── a34a8f2f-e3f5-42b7-9f18-cad076ad70bb.png │ │ ├── 
bda58b80-f069-45f5-9eb3-5d023fb731ec.png │ │ ├── cd2eed6b-8aea-40f1-a635-3cb9b4f1f460.png │ │ ├── d10f73b1-ecff-4562-a3b9-3d2d0fc0b82a.png │ │ └── e5a47fa8-f901-40cb-9f3d-39b048152fe2.png │ ├── image.json │ ├── image_eval_example.ipynb │ ├── image_eval_example.zh-CN.ipynb │ ├── image_text.json │ ├── images │ │ ├── 10007903636.jpg │ │ ├── 10089027076.jpg │ │ ├── cake.jpg │ │ └── giraffe.jpg │ ├── new_gen_image_eval_example.ipynb │ ├── ref_image.json │ └── run_images.py ├── image_process │ ├── image.json │ ├── image_process_example.ipynb │ ├── image_process_example.zh-CN.ipynb │ ├── images │ │ ├── hflip.png │ │ ├── origin.png │ │ ├── other.png │ │ ├── resize.png │ │ └── rotate.png │ └── test.py ├── text_eval │ ├── alpaca_5_samples.json │ ├── fineweb_5_samples.json │ ├── text_eval_example.ipynb │ └── text_eval_example.zh-CN.ipynb ├── text_process │ ├── fineweb_5_samples.json │ ├── reasoners │ │ ├── math_5_samples.json │ │ └── text_process_mathproblem.jsonl │ ├── text_process_example.ipynb │ └── text_process_example.zh-CN.ipynb ├── video_eval │ ├── test_video.avi │ ├── test_video.mkv │ ├── test_video.mov │ ├── test_video.mp4 │ ├── video-caption.json │ ├── video.json │ ├── video_caption_eval.yaml │ ├── video_eval.yaml │ ├── video_eval_example.ipynb │ ├── video_eval_example.py │ └── video_eval_example.zh-CN.ipynb └── video_process │ ├── video5data.json │ ├── video_caption_process.yaml │ ├── video_process.yaml │ ├── video_process_example.ipynb │ ├── video_process_example.zh-CN.ipynb │ ├── videocap5data.json │ └── videos │ ├── 4744073127.mp4 │ ├── 5319047612.mp4 │ ├── 6408325533.mp4 │ ├── 8536919744.mp4 │ └── 8724380666.mp4 ├── docs ├── Makefile ├── README.md ├── _build │ ├── doctrees │ │ ├── README.doctree │ │ ├── environment.pickle │ │ ├── index.doctree │ │ └── src │ │ │ ├── getting_started │ │ │ ├── Bring_Your_Own_Scorer.doctree │ │ │ ├── Installation.doctree │ │ │ ├── customized_scorer.doctree │ │ │ └── index.doctree │ │ │ └── metrics │ │ │ ├── gen_text_metrics.doctree 
│ │ │ ├── gen_text_metrics.zh-CN.doctree │ │ │ ├── image_metrics.doctree │ │ │ ├── image_metrics.zh-CN.doctree │ │ │ ├── image_process.doctree │ │ │ ├── image_process.zh-CN.doctree │ │ │ ├── index.doctree │ │ │ ├── synth_metrics.doctree │ │ │ ├── text_metrics.doctree │ │ │ ├── text_metrics.zh-CN.doctree │ │ │ ├── text_process.doctree │ │ │ ├── text_process.zh-CN.doctree │ │ │ ├── video_metrics.doctree │ │ │ ├── video_metrics.zh-CN.doctree │ │ │ ├── video_process.doctree │ │ │ └── video_process.zh-CN.doctree │ └── html │ │ ├── .buildinfo │ │ ├── .doctrees │ │ ├── README.doctree │ │ ├── environment.pickle │ │ ├── index.doctree │ │ └── src │ │ │ ├── getting_started │ │ │ ├── Bring_Your_Own_Scorer.doctree │ │ │ ├── Installation.doctree │ │ │ ├── customized_scorer.doctree │ │ │ └── index.doctree │ │ │ └── metrics │ │ │ ├── image_metrics.doctree │ │ │ ├── image_metrics.zh-CN.doctree │ │ │ ├── index.doctree │ │ │ ├── synth_metrics.doctree │ │ │ ├── text_metrics.doctree │ │ │ ├── text_metrics.zh-CN.doctree │ │ │ ├── video_metrics.doctree │ │ │ └── video_metrics.zh-CN.doctree │ │ ├── README.html │ │ ├── _sources │ │ ├── README.md.txt │ │ ├── index.rst.txt │ │ └── src │ │ │ ├── getting_started │ │ │ ├── Bring_Your_Own_Scorer.md.txt │ │ │ ├── Installation.md.txt │ │ │ ├── customized_scorer.md.txt │ │ │ └── index.rst.txt │ │ │ └── metrics │ │ │ ├── gen_text_metrics.md.txt │ │ │ ├── gen_text_metrics.zh-CN.md.txt │ │ │ ├── image_metrics.md.txt │ │ │ ├── image_metrics.zh-CN.md.txt │ │ │ ├── image_process.md.txt │ │ │ ├── image_process.zh-CN.md.txt │ │ │ ├── index.rst.txt │ │ │ ├── synth_metrics.md.txt │ │ │ ├── text_metrics.md.txt │ │ │ ├── text_metrics.zh-CN.md.txt │ │ │ ├── text_process.md.txt │ │ │ ├── text_process.zh-CN.md.txt │ │ │ ├── video_metrics.md.txt │ │ │ ├── video_metrics.zh-CN.md.txt │ │ │ ├── video_process.md.txt │ │ │ └── video_process.zh-CN.md.txt │ │ ├── _static │ │ ├── _sphinx_javascript_frameworks_compat.js │ │ ├── basic.css │ │ ├── css │ │ │ ├── 
badge_only.css │ │ │ ├── fonts │ │ │ │ ├── Roboto-Slab-Bold.woff │ │ │ │ ├── Roboto-Slab-Bold.woff2 │ │ │ │ ├── Roboto-Slab-Regular.woff │ │ │ │ ├── Roboto-Slab-Regular.woff2 │ │ │ │ ├── fontawesome-webfont.eot │ │ │ │ ├── fontawesome-webfont.svg │ │ │ │ ├── fontawesome-webfont.ttf │ │ │ │ ├── fontawesome-webfont.woff │ │ │ │ ├── fontawesome-webfont.woff2 │ │ │ │ ├── lato-bold-italic.woff │ │ │ │ ├── lato-bold-italic.woff2 │ │ │ │ ├── lato-bold.woff │ │ │ │ ├── lato-bold.woff2 │ │ │ │ ├── lato-normal-italic.woff │ │ │ │ ├── lato-normal-italic.woff2 │ │ │ │ ├── lato-normal.woff │ │ │ │ └── lato-normal.woff2 │ │ │ └── theme.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── fonts │ │ │ ├── Lato │ │ │ │ ├── lato-bold.eot │ │ │ │ ├── lato-bold.ttf │ │ │ │ ├── lato-bold.woff │ │ │ │ ├── lato-bold.woff2 │ │ │ │ ├── lato-bolditalic.eot │ │ │ │ ├── lato-bolditalic.ttf │ │ │ │ ├── lato-bolditalic.woff │ │ │ │ ├── lato-bolditalic.woff2 │ │ │ │ ├── lato-italic.eot │ │ │ │ ├── lato-italic.ttf │ │ │ │ ├── lato-italic.woff │ │ │ │ ├── lato-italic.woff2 │ │ │ │ ├── lato-regular.eot │ │ │ │ ├── lato-regular.ttf │ │ │ │ ├── lato-regular.woff │ │ │ │ └── lato-regular.woff2 │ │ │ └── RobotoSlab │ │ │ │ ├── roboto-slab-v7-bold.eot │ │ │ │ ├── roboto-slab-v7-bold.ttf │ │ │ │ ├── roboto-slab-v7-bold.woff │ │ │ │ ├── roboto-slab-v7-bold.woff2 │ │ │ │ ├── roboto-slab-v7-regular.eot │ │ │ │ ├── roboto-slab-v7-regular.ttf │ │ │ │ ├── roboto-slab-v7-regular.woff │ │ │ │ └── roboto-slab-v7-regular.woff2 │ │ ├── jquery.js │ │ ├── js │ │ │ ├── badge_only.js │ │ │ ├── html5shiv-printshiv.min.js │ │ │ ├── html5shiv.min.js │ │ │ ├── theme.js │ │ │ └── versions.js │ │ ├── language_data.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── sphinx_highlight.js │ │ └── translations.js │ │ ├── genindex.html │ │ ├── index.html │ │ ├── objects.inv │ │ ├── search.html │ │ ├── searchindex.js │ │ └── src │ │ ├── getting_started │ │ ├── 
Bring_Your_Own_Scorer.html │ │ ├── Installation.html │ │ ├── customized_scorer.html │ │ └── index.html │ │ └── metrics │ │ ├── gen_text_metrics.html │ │ ├── gen_text_metrics.zh-CN.html │ │ ├── image_metrics.html │ │ ├── image_metrics.zh-CN.html │ │ ├── image_process.html │ │ ├── image_process.zh-CN.html │ │ ├── index.html │ │ ├── synth_metrics.html │ │ ├── text_metrics.html │ │ ├── text_metrics.zh-CN.html │ │ ├── text_process.html │ │ ├── text_process.zh-CN.html │ │ ├── video_metrics.html │ │ ├── video_metrics.zh-CN.html │ │ ├── video_process.html │ │ └── video_process.zh-CN.html ├── conf.py ├── index.rst ├── make.bat └── src │ ├── developer │ ├── index.rst │ ├── logging.md │ └── testcase.md │ ├── getting_started │ ├── Installation.md │ ├── customized_scorer.md │ └── index.rst │ └── metrics │ ├── gen_text_metrics.md │ ├── gen_text_metrics.zh-CN.md │ ├── image_metrics.md │ ├── image_metrics.zh-CN.md │ ├── image_process.md │ ├── image_process.zh-CN.md │ ├── index.rst │ ├── text_metrics.md │ ├── text_metrics.zh-CN.md │ ├── text_process.md │ ├── text_process.zh-CN.md │ ├── video_metrics.md │ ├── video_metrics.zh-CN.md │ ├── video_process.md │ └── video_process.zh-CN.md ├── eval.py ├── image_process.py ├── new_image_eval_example.py ├── process.py ├── processed.jsonl ├── requirements.txt ├── setup.py ├── static └── images │ ├── Face.png │ └── example_1.png └── test.py /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: documentation 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: actions/setup-python@v5 14 | - name: Install dependencies 15 | run: | 16 | pip install sphinx sphinx_rtd_theme myst_parser 17 | - name: Sphinx build 18 | run: | 19 | sphinx-build docs _build 20 | - name: Deploy to GitHub Pages 21 | uses: peaceiris/actions-gh-pages@v3 
22 | if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} 23 | with: 24 | publish_branch: gh-pages 25 | github_token: ${{ secrets.GITHUB_TOKEN }} 26 | publish_dir: _build/ 27 | force_orphan: true -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | **/__pycache__/ 3 | **/.ipynb_checkpoints/ 4 | **/Kenlm/wikipedia 5 | **/Kenlm/.cache 6 | tmp/ 7 | build/ 8 | ./models/* 9 | scores.json 10 | *egg-info/ 11 | compute_stats.py 12 | output.txt 13 | texttest/* 14 | test_main.txt 15 | statss.json 16 | pure_video_scores.json 17 | configs/process/experiments/ 18 | .vscode 19 | **/meteorscorer/data 20 | **/meteorscorer/data/meteor-1.5.jar 21 | **/ciderscorer/coco-val-df.p 22 | **/data 23 | **/ckpt 24 | tmp.* 25 | configs/process/text_process_reasoner.yaml 26 | docs/src/getting_started/Dev.md -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "dataflow/Eval/image/longclip"] 2 | path = dataflow/Eval/image/longclip 3 | url = https://github.com/beichenzbc/Long-CLIP.git 4 | [submodule "dataflow/Eval/Text/models/UniEval"] 5 | path = dataflow/Eval/Text/models/UniEval 6 | url = https://github.com/MOLYHECI/UniEval.git 7 | branch = main 8 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from dataflow import * -------------------------------------------------------------------------------- /configs/eval/gen_image.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | dependencies: [image] 4 | 5 | # data: 6 | # image: 7 | # meta_data_path: 
"demos/image_eval/image.json" 8 | # data_path: "demos/image_eval/images" 9 | # ref_meta_data_path: "demos/image_eval/image_copy.json" 10 | # ref_data_path: "demos/image_eval/images" 11 | # image_key: 'image' 12 | # formatter: 'GenImageFormatter' 13 | 14 | data: 15 | image: 16 | meta_data_path: "demos/image_eval/gen_image.json" 17 | data_path: "demos/image_eval/gen_images" 18 | ref_meta_data_path: "demos/image_eval/ref_image.json" 19 | ref_data_path: "demos/image_eval/images" 20 | image_key: 'image' 21 | formatter: 'GenImageFormatter' 22 | 23 | scorers: 24 | ISScorer: 25 | batch_size: 32 26 | device: cpu 27 | dims: 2048 28 | resize: True 29 | splits: 10 30 | KIDScorer: 31 | batch_size: 50 32 | num_workers: 8 33 | dims: 2048 34 | device: cpu 35 | model: inception 36 | -------------------------------------------------------------------------------- /configs/eval/image_eval.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | dependencies: [image] 4 | 5 | data: 6 | image: 7 | meta_data_path: "demos/image_eval/image.json" 8 | data_path: "demos/image_eval/images" 9 | image_key: 'image' 10 | id_key: 'id' 11 | formatter: 'PureImageFormatter' 12 | 13 | scorers: 14 | LiqeScorer: 15 | batch_size: 2 16 | device: "cuda" 17 | # ArniqaScorer: 18 | # batch_size: 2 19 | # device: "cuda" 20 | # TopiqScorer: 21 | # batch_size: 2 22 | # device: "cuda" 23 | # ClipiqaScorer: 24 | # batch_size: 2 25 | # device: "cuda" 26 | # QalignScorer: 27 | # batch_size: 2 28 | # device: "cuda" 29 | # ManiqaScorer: 30 | # batch_size: 2 31 | # device: "cuda" 32 | # MusiqScorer: 33 | # batch_size: 2 34 | # device: "cuda" 35 | # DbcnnScorer: 36 | # batch_size: 2 37 | # device: "cuda" 38 | # Pqa2piqScorer: 39 | # batch_size: 2 40 | # device: "cuda" 41 | # HyperiqaScorer: 42 | # batch_size: 2 43 | # device: "cuda" 44 | # NimaScorer: 45 | # batch_size: 2 46 | # device: "cuda" 47 | # 
WadiqamScoreer: 48 | # batch_size: 2 49 | # device: "cuda" 50 | # CnniqaScorer: 51 | # batch_size: 2 52 | # device: "cuda" 53 | # NrqmScoreer: 54 | # batch_size: 2 55 | # device: "cuda" 56 | # PiScorer: 57 | # batch_size: 2 58 | # device: "cuda" 59 | # BrisqueScorer: 60 | # batch_size: 2 61 | # device: "cuda" 62 | # IlniqeScorer: 63 | # batch_size: 2 64 | # device: "cuda" 65 | # NiqeScorer: 66 | # batch_size: 2 67 | # device: "cuda" 68 | # PiqeScorer: 69 | # batch_size: 2 70 | # device: "cuda" 71 | 72 | -------------------------------------------------------------------------------- /configs/eval/image_eval_example.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | dependencies: [image] 4 | 5 | data: 6 | image: 7 | meta_data_path: "demos/image_eval/image.json" 8 | data_path: "demos/image_eval/images" 9 | image_key: 'image' 10 | id_key: 'id' 11 | formatter: 'PureImageFormatter' 12 | 13 | scorers: 14 | LiqeScorer: 15 | batch_size: 2 16 | device: "cuda" 17 | ArniqaScorer: 18 | batch_size: 2 19 | device: "cuda" 20 | -------------------------------------------------------------------------------- /configs/eval/image_text_eval_example.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | dependencies: [image] 4 | 5 | data: 6 | image_caption: 7 | meta_data_path: "demos/image_eval/image_text.json" 8 | data_path: "demos/image_eval/images" 9 | image_key: 'image' 10 | image_caption_key: 'caption' 11 | id_key: 'id' 12 | formatter: 'ImageCaptionFormatter' 13 | 14 | scorers: 15 | ClipScorer: 16 | batch_size: 2 17 | device: "cuda" 18 | LongClipScorer: 19 | model_size: B 20 | batch_size: 2 21 | device: "cuda" 22 | -------------------------------------------------------------------------------- /configs/eval/text_scorer_example1.yaml: 
-------------------------------------------------------------------------------- 1 | # Only some example scorers are listed here. Please refer to all_scorers.yaml for all scorers 2 | 3 | model_cache_path: '../ckpt' # cache path for models 4 | dependencies: [text] 5 | save_path: './scores.json' 6 | 7 | data: 8 | text: 9 | use_hf: False # Whether to use huggingface_dataset, if used, ignore the local data path below 10 | dataset_name: 'yahma/alpaca-cleaned' 11 | dataset_split: 'train' 12 | name: 'default' 13 | revision: null 14 | data_path: 'demos/text_eval/fineweb_5_samples.json' # Local data path, supports json, jsonl, parquet formats 15 | formatter: "TextFormatter" # Data loader type 16 | 17 | keys: 'text' # Key name to be evaluated, for sft data, it can be specified as ['instruction','input','output'] 18 | 19 | scorers: # You can select multiple text scorers from all_scorers.yaml and put their configuration information here 20 | NgramScorer: 21 | ngrams: 5 22 | LexicalDiversityScorer: 23 | metrics_to_keep: 24 | mtld: True 25 | hdd: True -------------------------------------------------------------------------------- /configs/eval/text_scorer_example2.yaml: -------------------------------------------------------------------------------- 1 | # Only some example scorers are listed here. 
Please refer to all_scorers.yaml for all scorers 2 | 3 | model_cache_path: '../ckpt' # cache path for models 4 | dependencies: [text] 5 | save_path: './scores.json' 6 | 7 | data: 8 | text: 9 | use_hf: False # Whether to use huggingface_dataset, if used, ignore the local data path below 10 | dataset_name: 'yahma/alpaca-cleaned' 11 | dataset_split: 'train' 12 | name: 'default' 13 | revision: null 14 | data_path: 'demos/text_eval/fineweb_5_samples.json' # Local data path, supports json, jsonl, parquet formats 15 | formatter: "TextFormatter" # Data loader type 16 | 17 | keys: 'text' # Key name to be evaluated, for sft data, it can be specified as ['instruction','input','output'] 18 | 19 | scorers: # You can select multiple text scorers from all_scorers.yaml and put their configuration information here 20 | QuratingScorer: 21 | model: 'princeton-nlp/QuRater-1.3B' 22 | tokens_field: 'input_ids' 23 | tokens: 512 24 | map_batch_size: 512 25 | num_workers: 1 26 | device_batch_size: 16 27 | device: 'cuda:0' 28 | labels: 29 | - writing_style 30 | - required_expertise 31 | - facts_and_trivia 32 | - educational_value 33 | PresidioScorer: 34 | language: 'en' 35 | device: 'cuda:0' -------------------------------------------------------------------------------- /configs/process/image_deduplicate.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | dependencies: [image] 4 | save_path: "./processed.jsonl" 5 | 6 | 7 | data: 8 | image: 9 | meta_data_path: "demos/image_process/image.json" 10 | data_path: "demos/image_process/images" 11 | image_key: 'image' 12 | formatter: 'PureImageFormatter' 13 | 14 | processors: 15 | ImagePHashDeduplicator: 16 | threshold: 13 # hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). 
17 | ImageDHashDeduplicator: 18 | threshold: 13 # hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). 19 | ImageWHashDeduplicator: 20 | threshold: 13 # hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). 21 | ImageAHashDeduplicator: 22 | threshold: 13 # hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). -------------------------------------------------------------------------------- /configs/process/image_filter.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | dependencies: [image] 4 | save_path: "./processed.jsonl" 5 | 6 | data: 7 | image: 8 | meta_data_path: "demos/image_eval/image.json" 9 | data_path: "demos/image_eval/images" 10 | image_key: 'image' 11 | formatter: 'PureImageFormatter' 12 | 13 | processors: 14 | ImageResolutionFilter: 15 | min_width: 160 16 | max_width: 7680 17 | min_height: 120 18 | max_height: 4320 19 | batch_size: 2 20 | LiqeFilter: 21 | batch_size: 2 22 | device: "cuda" 23 | min_score: 3 24 | max_score: 5 25 | QalignFilter: 26 | batch_size: 2 27 | device: "cuda" 28 | min_score: 3 29 | max_score: 5 30 | ImageAspectRatioFilter: 31 | min_ratio: 0.2 32 | max_ratio: 5.0 33 | batch_size: 2 -------------------------------------------------------------------------------- /configs/process/image_text_filter.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | dependencies: [image] 4 | save_path: "./processed.jsonl" 5 | 6 | data: 7 | image_caption: 8 | meta_data_path: "demos/image_eval/image_text.json" 9 | data_path: "demos/image_eval/images" 10 | image_key: 'image' 11 | image_caption_key: 'caption' 12 | formatter: 'ImageCaptionFormatter' 13 
| 14 | processors: 15 | ClipFilter: 16 | batch_size: 2 17 | device: "cuda" 18 | min_score: 30 19 | LongClipFilter: 20 | batch_size: 2 21 | device: "cuda" 22 | min_score: 25 23 | model_size: B -------------------------------------------------------------------------------- /configs/process/text_process_example.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | dependencies: [text] 3 | save_path: "./processed.jsonl" 4 | 5 | data: 6 | text: 7 | use_hf: False # Whether to use huggingface_dataset, if used, ignore the local data path below 8 | dataset_name: 'yahma/alpaca-cleaned' 9 | dataset_split: 'train' 10 | name: 'default' 11 | revision: null 12 | data_path: 'demos/text_process/fineweb_5_samples.json' # Local data path, supports json, jsonl, parquet formats 13 | formatter: "TextFormatter" # Data loader type 14 | 15 | keys: 'text' # Key name to be processed, for sft data, it can be specified as ['instruction','input','output'] 16 | 17 | processors: 18 | RemoveExtraSpacesRefiner: {} 19 | CCNetDeduplicator: 20 | bit_length: 64 # should be a multiple of 8 21 | NgramFilter: 22 | min_score: 0.99 23 | max_score: 1.0 24 | scorer_args: 25 | ngrams: 5 -------------------------------------------------------------------------------- /configs/process/text_process_reasoner_ansfilter.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | dependencies: [text] 3 | save_path: "../dataflow-develop/processed.jsonl" 4 | 5 | data: 6 | text: 7 | use_hf: False # Whether to use huggingface_dataset, if used, ignore the local data path below 8 | dataset_name: 'yahma/alpaca-cleaned' 9 | dataset_split: 'train' 10 | name: 'default' 11 | revision: null 12 | data_path: './demos/text_process/reasoners/math_5_samples.json' # Local data path, supports json, jsonl, parquet formats 13 | formatter: "TextFormatter" # Data 
loader type 14 | keys: 'answer' # Key name to be processed, for sft data, it can be specified as ['instruction','input','output'] 15 | 16 | processors: 17 | AnswerFormatterFilter: {} 18 | AnswerNgramFilter: 19 | min_score: 0.1 20 | max_score: 1.0 21 | ngrams: 5 22 | AnswerGroundTruthFilter: 23 | compare_method: math_verify # exact or math_verify 24 | AnswerTokenLengthFilter: 25 | max_answer_token_length: 512 26 | tokenizer_dir: '../Qwen2.5-0.5B-Instruct' 27 | 28 | 29 | -------------------------------------------------------------------------------- /configs/process/video_text_process.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | dependencies: [video] 4 | save_path: './example.jsonl' 5 | data: 6 | video_caption: 7 | meta_data_path: 'configs/process/videocap5data.json' 8 | data_path: '/mnt/petrelfs/mengzimo/videodataset/vatex/videos/' 9 | formatter: 'VideoCaptionFormatter' # Formatter for video-caption evaluation 10 | 11 | processors: 12 | EMScoreFilter: 13 | min_score: 0.3 14 | max_score: 1.0 15 | scorer_args: 16 | batch_size: 16 17 | num_workers: 4 18 | PACScoreFilter: 19 | min_score: 0.3 20 | max_score: 1.0 21 | scorer_args: 22 | batch_size: 16 23 | num_workers: 4 24 | model_path: ./models/clip_ViT-B-32.pth 25 | -------------------------------------------------------------------------------- /dataflow/Agent for System/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/dataflow/Agent for System/__init__.py -------------------------------------------------------------------------------- /dataflow/Eval/Text/apicaller/perspective_scorer.py: -------------------------------------------------------------------------------- 1 | from googleapiclient import discovery 2 | from dataflow.core import TextScorer 3 | from 
dataflow.utils.registry import MODEL_REGISTRY 4 | 5 | # PerspectiveAPI toxicity evaluation 6 | @MODEL_REGISTRY.register() 7 | class PerspectiveScorer(TextScorer): 8 | def __init__(self, args_dict): 9 | super().__init__(args_dict) 10 | self.api_key = args_dict.get('api_key') 11 | self.api_name = args_dict.get('api_name') 12 | self.api_version = args_dict.get('api_version') 13 | self.discovery_service_url = args_dict.get('discovery_service_url') 14 | self.static_discovery = args_dict.get('static_discovery') 15 | self.client = discovery.build( 16 | self.api_name, 17 | self.api_version, 18 | developerKey=self.api_key, 19 | discoveryServiceUrl=self.discovery_service_url, 20 | static_discovery=self.static_discovery, 21 | ) 22 | self.batch_size = 1 23 | self.score_type = float 24 | self.data_type = 'text' 25 | self.score_name = 'PerspectiveScore' 26 | 27 | def analyze_toxicity(self, text): 28 | analyze_request = { 29 | 'comment': {'text': text}, 30 | 'requestedAttributes': {'TOXICITY': {}} 31 | } 32 | response = self.client.comments().analyze(body=analyze_request).execute() 33 | return response['attributeScores']['TOXICITY']['spanScores'][0]['score']['value'] 34 | 35 | def evaluate_batch(self, batch): 36 | results = [] 37 | input_data = next(iter(batch.values())) 38 | 39 | for sample in input_data: 40 | text = sample 41 | max_bytes = 20480 42 | 43 | if len(text.encode('utf-8')) > max_bytes: 44 | score = None 45 | else: 46 | score = self.analyze_toxicity(text) 47 | 48 | results.append(score) 49 | 50 | return results 51 | -------------------------------------------------------------------------------- /dataflow/Eval/Text/diversity/vendi_scorer.py: -------------------------------------------------------------------------------- 1 | from vendi_score import text_utils 2 | from dataflow.core import TextScorer 3 | from dataflow.utils.registry import MODEL_REGISTRY 4 | 5 | # VendiScore dataset diversity evaluation 6 | # cited from: The Vendi Score: A Diversity Evaluation Metric
# VendiScore dataset diversity evaluation
# cited from: The Vendi Score: A Diversity Evaluation Metric for Machine Learning
@MODEL_REGISTRY.register()
class VendiScorer(TextScorer):
    """Corpus-level diversity via Vendi Scores (n-gram / BERT / SimCSE variants)."""

    def __init__(self, args_dict):
        super().__init__(args_dict)
        self.bert_model_path = args_dict.get('bert_model_path')
        self.simcse_model_path = args_dict.get('simcse_model_path')
        # Default to an empty dict so a missing/None config key simply disables
        # all metrics instead of crashing on None.get() in evaluate_batch
        # (consistent with LangkitScorer's metrics_to_keep handling).
        self.metrics_to_keep = args_dict.get('metrics_to_keep') or {}
        self.device = args_dict.get('device')
        self.batch_size = -1  # diversity is computed over the whole dataset at once
        self.use_meta = True
        self.score_type = float
        self.data_type = 'text'
        self.score_name = 'VendiScore'

    def evaluate_batch(self, batch):
        """Compute the enabled Vendi Score variants over all sentences.

        :param batch: dict whose first value is the full list of sentences
        :returns: dict mapping metric name -> rounded score
        """
        sentences = next(iter(batch.values()))

        result = {}

        if self.metrics_to_keep.get("ngram", False):
            ngram_vs = text_utils.ngram_vendi_score(sentences, ns=[1, 2, 3, 4])
            result["N-gramsVendiScore"] = round(ngram_vs, 2)

        if self.metrics_to_keep.get("bert", False):
            bert_vs = text_utils.embedding_vendi_score(sentences, model_path=self.bert_model_path, device=self.device)
            result["BERTVendiScore"] = round(bert_vs, 2)

        if self.metrics_to_keep.get("simcse", False):
            simcse_vs = text_utils.embedding_vendi_score(sentences, model_path=self.simcse_model_path, device=self.device)
            result["SimCSEVendiScore"] = round(simcse_vs, 2)

        return result
@MODEL_REGISTRY.register()
class BERTScoreScorer(GenTextScorer):
    """Reference-based generation metric: BERTScore F1 via the `evaluate` hub."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.batch_size = 1
        self.data_type = "text"
        self.scorer_name = "BERTScoreScorer"
        self.score_type = float

        # BERTScore configuration (all optional, with the library defaults).
        self.lang = args_dict.get("lang", "en")
        self.model_type = args_dict.get("model_type", "distilbert-base-uncased")
        self.idf = args_dict.get("idf", False)
        self.rescale_with_baseline = args_dict.get("rescale_with_baseline", False)

        # Metric is loaded once and reused for every batch.
        self.bertscore = evaluate.load("bertscore")

    def evaluate_batch(self, eval_batch, ref_batch=None):
        """Return the per-sample BERTScore F1 for predictions vs. references."""
        predictions = next(iter(eval_batch.values()))
        references = next(iter(ref_batch.values())) if ref_batch else None

        if references is None:
            raise ValueError("Reference data must be provided for BERTScore Scorer.")

        metric_output = self.bertscore.compute(
            predictions=predictions,
            references=references,
            lang=self.lang,
            model_type=self.model_type,
            idf=self.idf,
            rescale_with_baseline=self.rescale_with_baseline,
        )
        return metric_output["f1"]
@MODEL_REGISTRY.register()
class BleurtScorer(GenTextScorer):
    """Reference-based generation metric using a BLEURT learned checkpoint."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.batch_size = 1
        self.data_type = "text"
        self.scorer_name = "BleurtScorer"
        self.score_type = float

        # Load the BLEURT metric once with the base-128 checkpoint.
        self.bleurt = evaluate.load("bleurt", module_type="metric", checkpoint="bleurt-base-128")

    def evaluate_batch(self, eval_batch, ref_batch=None):
        """Score each (prediction, reference) pair with BLEURT."""
        predictions = next(iter(eval_batch.values()))
        references = next(iter(ref_batch.values())) if ref_batch else None

        if references is None:
            raise ValueError("Reference data must be provided for BLEURT Scorer.")

        return [
            self.bleurt.compute(predictions=[pred], references=[ref])['scores'][0]
            for pred, ref in zip(predictions, references)
        ]
@MODEL_REGISTRY.register()
class CHRFScorer(GenTextScorer):
    """Sentence-level chrF score via sacrebleu."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.batch_size = 1
        self.data_type = "text"
        self.scorer_name = "CHRFScorer"
        self.score_type = float

        # sacrebleu chrF parameters (char/word n-gram orders and F-beta weight).
        self.char_order = args_dict.get("char_order", 6)
        self.word_order = args_dict.get("word_order", 0)
        self.beta = args_dict.get("beta", 3)

    def evaluate_batch(self, eval_batch, ref_batch=None):
        """Return one chrF score per (prediction, reference) pair."""
        predictions = next(iter(eval_batch.values()))
        references = next(iter(ref_batch.values())) if ref_batch else None

        if references is None:
            raise ValueError("Reference data must be provided for CHRF Scorer.")

        return [
            sacrebleu.sentence_chrf(
                hypothesis=pred,
                references=[ref],
                char_order=self.char_order,
                word_order=self.word_order,
                beta=self.beta,
            ).score
            for pred, ref in zip(predictions, references)
        ]
import pickle


def load_idf(idf_path):
    """Load precomputed document-frequency (IDF) statistics for CIDEr.

    :param idf_path: path to a pickled IDF dictionary (e.g. coco-val-df.p)
    :returns: the unpickled IDF object
    :raises FileNotFoundError: with a descriptive message if the file is missing
    """
    if not os.path.isfile(idf_path):
        raise FileNotFoundError(f"CIDEr IDF file not found: {idf_path}")
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only point idf_path at trusted, locally-vetted artifacts.
    with open(idf_path, 'rb') as f:
        idf = pickle.load(f, encoding='utf-8')
    return idf


@MODEL_REGISTRY.register()
class CiderScorer(GenTextScorer):
    """CIDEr metric for generated text against references.

    df_mode 'corpus' derives IDF from the references themselves; any other
    value loads precomputed IDF statistics from idf_path.
    """

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.batch_size = 1
        self.data_type = "text"
        self.scorer_name = "CiderScorer"
        self.score_type = float
        self.n = args_dict.get("n", 4)            # max n-gram length
        self.sigma = args_dict.get("sigma", 6.0)  # Gaussian length-penalty width

        df_mode = args_dict.get("df_mode", "coco-val-df")
        if df_mode != "corpus":
            idf_path = args_dict.get("idf_path", "dataflow/Eval/Text/gen/ciderscorer/coco-val-df.p")
            self.idf = load_idf(idf_path)
        else:
            self.idf = None  # 'corpus' mode: IDF computed from the references

    def evaluate_batch(self, eval_batch, ref_batch=None):
        """Return one CIDEr score per (prediction, reference) pair.

        :raises ValueError: if no reference batch is supplied
        """
        eval_data = next(iter(eval_batch.values()))
        ref_data = next(iter(ref_batch.values())) if ref_batch else None

        if ref_data is None:
            raise ValueError("Reference data must be provided for CIDEr Scorer.")

        scores = []
        for eval_text, ref_text in zip(eval_data, ref_data):
            cider = Cider(
                test=eval_text,
                refs=[ref_text],
                n=self.n,
                sigma=self.sigma,
                idf=self.idf,  # None selects 'corpus' mode below
            )
            cider_score, _ = cider.compute_score(df_mode='corpus' if self.idf is None else 'coco-val-df')
            scores.append(cider_score)

        return scores
@MODEL_REGISTRY.register()
class MeteorScorer(GenTextScorer):
    """METEOR metric for generated text against references."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.batch_size = 1
        self.data_type = "text"
        self.scorer_name = "MeteorScorer"
        self.score_type = float
        self.language = args_dict.get("language", "en")

    def evaluate_batch(self, eval_batch, ref_batch=None):
        """Return one METEOR score per (prediction, reference) pair.

        :raises ValueError: if no reference batch is supplied
        """
        eval_data = next(iter(eval_batch.values()))
        ref_data = next(iter(ref_batch.values())) if ref_batch else None

        if ref_data is None:
            raise ValueError("Reference data must be provided for Meteor Scorer.")

        # Build the scorer once per batch instead of once per sample: METEOR
        # wrappers commonly hold a persistent helper process, so per-sample
        # construction wastes startup time and can leak resources.
        # (assumes Meteor.compute_score is reusable across calls — TODO confirm)
        meteor_scorer = Meteor(language=self.language)
        return [
            meteor_scorer.compute_score(eval_text, [ref_text])
            for eval_text, ref_text in zip(eval_data, ref_data)
        ]
@MODEL_REGISTRY.register()
class RougeScorer(GenTextScorer):
    """ROUGE-L scorer for generated text against reference text."""

    def __init__(self, args_dict: dict):
        """
        Initializes the RougeScorer with the provided configuration.
        :param args_dict: dict : Configuration; supports 'beta' (ROUGE-L
            F-measure weight, default 1.2).
        """
        super().__init__(args_dict)

        self.batch_size = 1  # Default batch size to 1
        self.data_type = "text"
        self.scorer_name = "RougeScorer"
        self.score_type = float

        # Beta for ROUGE-L F-score computation, default is 1.2
        self.beta = args_dict.get("beta", 1.2)

    def evaluate_batch(self, eval_batch, ref_batch=None):
        """
        Evaluate the batch of generated text against reference text using ROUGE-L.
        :param eval_batch: dict : Batch of generated text
        :param ref_batch: dict : Batch of reference text
        :returns: list : ROUGE-L score for each evaluation example
        :raises ValueError: if no reference batch is supplied
        """
        eval_data = next(iter(eval_batch.values()))  # Generated text
        ref_data = next(iter(ref_batch.values())) if ref_batch else None  # Reference text

        if ref_data is None:
            raise ValueError("Reference data must be provided for ROUGE Scorer.")

        # Configure the scorer once outside the loop (it only carries beta);
        # the previous version rebuilt an identical Rouge object per sample.
        rouge_scorer = Rouge(beta=self.beta)
        return [
            rouge_scorer.calc_score([eval_text], [ref_text])
            for eval_text, ref_text in zip(eval_data, ref_data)
        ]
@MODEL_REGISTRY.register()
class TERScorer(GenTextScorer):
    """Translation Edit Rate (TER) metric via the `evaluate` hub."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.batch_size = 1
        self.data_type = "text"
        self.scorer_name = "TERScorer"
        self.score_type = float

        # TER options: length normalization and case sensitivity.
        self.normalized = args_dict.get("normalized", True)
        self.case_sensitive = args_dict.get("case_sensitive", True)

        # Metric is loaded once and reused for every batch.
        self.ter = evaluate.load("ter")

    def evaluate_batch(self, eval_batch, ref_batch=None):
        """Return one TER score per (prediction, reference) pair."""
        predictions = next(iter(eval_batch.values()))
        references = next(iter(ref_batch.values())) if ref_batch else None

        if references is None:
            raise ValueError("Reference data must be provided for TER Scorer.")

        scores = []
        for pred, ref in zip(predictions, references):
            # TER expects a list of reference strings per prediction.
            refs = [ref] if isinstance(ref, str) else ref
            outcome = self.ter.compute(
                predictions=[pred],
                references=[refs],
                normalized=self.normalized,
                case_sensitive=self.case_sensitive,
            )
            scores.append(outcome['score'])
        return scores
@MODEL_REGISTRY.register()
class WsdScorer(GenTextScorer):
    """Word Mover's Distance between generated and reference text using
    word2vec-google-news-300 embeddings (lower distance = more similar)."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.batch_size = 1
        self.data_type = "text"
        self.scorer_name = "WSDScorer"
        self.score_type = float
        model = api.load('word2vec-google-news-300')
        print('api over')
        self.model = model

    def evaluate_batch(self, eval_batch, ref_batch=None):
        """Return one WMD score per (prediction, reference) pair.

        :raises ValueError: if no reference batch is supplied
        """
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        eval_data = next(iter(eval_batch.values()))
        ref_data = next(iter(ref_batch.values())) if ref_batch else None

        if ref_data is None:
            raise ValueError("Reference data must be provided for WSD Scorer.")

        scores = []
        for eval_text, ref_text in zip(eval_data, ref_data):
            eval_tokens = preprocess(eval_text)
            ref_tokens = preprocess(ref_text)
            wmd_score = self.model.wmdistance(eval_tokens, ref_tokens)
            scores.append(wmd_score)

        return scores


# Cached English stop-word set; built lazily so the NLTK download and the
# set construction happen once instead of on every preprocess() call.
_STOPWORDS = None


def _english_stopwords():
    """Download (once) and cache the NLTK English stop-word set."""
    global _STOPWORDS
    if _STOPWORDS is None:
        download('stopwords')  # no-op if the corpus is already present
        _STOPWORDS = set(stopwords.words('english'))
    return _STOPWORDS


def preprocess(sentence):
    """Lowercase, whitespace-tokenize, and drop English stop words."""
    stop_words = _english_stopwords()
    return [w for w in sentence.lower().split() if w not in stop_words]
# Used to get the ppl and emb for part of input, used in conditional version, and token-wise loss
def get_perplexity_and_embedding_part_text(tokenizer, model, text, target_span, max_length, device):
    """Compute perplexity/loss over only the target span of ``text``.

    Tokens before the last occurrence of ``target_span`` are masked with -100
    so the LM loss covers the span alone.

    :param tokenizer: HF-style tokenizer providing ``encode``
    :param model: causal LM callable as ``model(input_ids, labels=...)``
    :param text: full input string
    :param target_span: substring whose tokens contribute to the loss
    :param max_length: truncation length for tokenization
    :param device: device to place tensors on
    :returns: (perplexity, loss) as Python floats, or (0, 0) on any failure
    """
    try:
        input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)

        # Locate the span and count the tokens that precede it.
        start_index = text.rfind(target_span)
        start_token = len(tokenizer.encode(text[:start_index]))

        labels = input_ids.clone()
        labels[0, :start_token] = -100  # -100 tokens are ignored by the LM loss

        with torch.no_grad():
            outputs = model(input_ids, labels=labels)

        loss = outputs.loss
        perplexity = torch.exp(loss)

        return perplexity.to('cpu').item(), loss.to('cpu').item()

    except Exception:
        # Deliberate best-effort: a single bad sample yields (0, 0) rather than
        # aborting the whole analysis run. Narrowed from a bare ``except`` so
        # KeyboardInterrupt/SystemExit still propagate.
        return 0, 0
# FineWeb-Edu quality classifier (Huggingface)
# cited from: The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale
@MODEL_REGISTRY.register()
class FineWebEduScorer(TextScorer):
    """Scores text with the FineWeb-Edu educational-quality regression head."""

    def __init__(self, args_dict):
        super().__init__(args_dict)
        self.model_name = args_dict.get('model_name')
        self.model_cache_dir = args_dict.get('model_cache_dir')
        self.device = args_dict.get('device', 'cuda' if torch.cuda.is_available() else 'cpu')
        self.batch_size = 1
        self.score_type = float
        self.data_type = 'text'
        self.score_name = 'FineWebEduScore'
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, cache_dir=self.model_cache_dir
        ).to(self.device)
        self.model.eval()

    def evaluate_batch(self, batch) -> list:
        """Run the classifier over the batch and return the raw logit per text."""
        texts = next(iter(batch.values()))
        encoded = self.tokenizer(
            texts, return_tensors="pt", padding="longest", truncation=True
        ).to(self.device)

        with torch.no_grad():
            logits = self.model(**encoded).logits.squeeze(-1).float().detach().cpu().numpy()

        return logits.tolist()
# Presidio PII detection Scorer
@MODEL_REGISTRY.register()
class PresidioScorer(TextScorer):
    """Counts PII entities per text using Presidio with a transformers NER model."""

    def __init__(self, args_dict):
        super().__init__(args_dict)
        self.language = args_dict.get('language', 'en')
        self.device = args_dict.get('device', 'cpu')
        self.model_cache_dir = args_dict.get('model_cache_dir')

        # Pre-download the NER model so Presidio's NLP engine finds it cached.
        ner_model = 'dslim/bert-base-NER'
        self.tokenizer = AutoTokenizer.from_pretrained(ner_model, cache_dir=self.model_cache_dir)
        self.model = AutoModelForTokenClassification.from_pretrained(
            ner_model, cache_dir=self.model_cache_dir
        ).to(self.device)

        warnings.filterwarnings("ignore", category=UserWarning, module="spacy_huggingface_pipelines")
        model_config = [{
            "lang_code": self.language,
            "model_name": {
                "spacy": "en_core_web_sm",
                "transformers": ner_model,
            },
        }]

        self.nlp_engine = TransformersNlpEngine(models=model_config)
        self.analyzer = AnalyzerEngine(nlp_engine=self.nlp_engine)

        self.batch_size = 1
        self.score_type = float
        self.data_type = 'text'
        self.score_name = 'PresidioScore'

    def evaluate_batch(self, batch):
        """Return the number of detected PII entities for each text."""
        texts = next(iter(batch.values()))
        return [
            len(self.analyzer.analyze(text=text, language=self.language))
            for text in texts
        ]
# RMScorer for evaluating based on reward-model-deberta-v3-large-v2
@MODEL_REGISTRY.register()
class RMScorer(TextScorer):
    """Scores (instruction, output) pairs with a reward model's
    sequence-classification head (one scalar reward per pair)."""

    def __init__(self, args_dict):
        super().__init__(args_dict)
        self.model_name = args_dict.get('model_name')
        self.model_cache_dir = args_dict.get('model_cache_dir')  # custom HF cache dir
        self.batch_size = args_dict.get('batch_size')
        self.device = args_dict.get('device')
        self.score_type = float
        self.data_type = 'text'
        self.score_name = 'RewardModelScore'

        self.rank_model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, cache_dir=self.model_cache_dir
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)

    def evaluate_batch(self, batch):
        """Return one reward score (float) per (instruction, output) pair."""
        input_texts = batch.get('instruction', '')
        output_texts = batch.get('output', '')
        inputs = self.tokenizer(
            input_texts, output_texts, return_tensors='pt', padding=True, truncation=True
        ).to(self.device)

        with torch.no_grad():
            logits = self.rank_model(**inputs).logits.cpu().detach().numpy()

        scores = logits.squeeze()

        if scores.ndim == 0:
            # Single-sample batch: squeeze() yields a 0-d array. The previous
            # code wrapped it in a Python list and then called .tolist() on
            # that list, raising AttributeError; return the singleton directly.
            return [float(scores)]

        return scores.tolist()
# Textbook quality classifier (Huggingface)
# cited from: Textbooks Are All You Need
@MODEL_REGISTRY.register()
class TextbookScorer(TextScorer):
    """fastText textbook-quality classifier: expected score over Low/Mid/High labels."""

    def __init__(self, args_dict):
        super().__init__(args_dict)
        model_path = hf_hub_download(
            repo_id=args_dict.get('model_repo'),
            filename=args_dict.get('model_file'),
            cache_dir=args_dict.get('model_cache_dir')  # custom cache location
        )

        self.model = fasttext.load_model(model_path)
        self.batch_size = args_dict.get('batch_size')
        self.score_type = float
        self.data_type = 'text'
        self.score_name = 'TextbookScore'

        # Numeric value assigned to each predicted label.
        self.score_dict = {
            '__label__': 0,
            '__label__Low': args_dict.get('low_score', 1.0),
            '__label__Mid': args_dict.get('mid_score', 3.0),
            '__label__High': args_dict.get('high_score', 5.0),
        }

    @staticmethod
    def replace_newlines(text: str) -> str:
        """Collapse newline runs to single spaces (fastText wants one line)."""
        return re.sub("\n+", " ", text)

    def evaluate_batch(self, batch) -> List[float]:
        """Return the probability-weighted label score per text."""
        texts = [self.replace_newlines(t) for t in next(iter(batch.values()))]
        labels_batch, probs_batch = self.model.predict(texts, k=-1)

        results = []
        for labels, probs in zip(labels_batch, probs_batch):
            expected = sum(
                self.score_dict.get(label, 0) * prob
                for label, prob in zip(labels, probs)
            )
            results.append(float(expected))
        return results
# Langkit quality metrics
@MODEL_REGISTRY.register()
class LangkitScorer(TextScorer):
    """Extracts langkit light metrics per sample, one score list per metric."""

    def __init__(self, args_dict):
        super().__init__(args_dict)
        self.llm_schema = light_metrics.init()
        self.metrics_to_keep = args_dict.get('metrics_to_keep', {})
        self.batch_size = 1
        self.score_type = float
        self.data_type = 'text'
        self.score_name = 'LangkitScore'

    def evaluate_batch(self, batch):
        """Return {LangkitXxxScore: [per-sample values]} for the kept metrics."""
        samples = next(iter(batch.values()))
        results = {}
        for sample in samples:
            frame = pd.DataFrame({'prompt': [sample]})
            frame['response'] = ''  # langkit schema expects a response column
            record = extract(frame, schema=self.llm_schema).to_dict(orient='records')[0]

            # Keep prompt-derived metrics (with their 'prompt.' prefix stripped)
            # plus any metric not tied to the dummy response column.
            metrics = {}
            for key, value in record.items():
                if key == 'prompt':
                    continue
                if key.startswith('prompt.'):
                    metrics[key[len('prompt.'):]] = value
                elif key != 'response' and not key.startswith('response.'):
                    metrics[key] = value

            metrics.pop('has_patterns', None)

            if self.metrics_to_keep:
                metrics = {k: v for k, v in metrics.items() if self.metrics_to_keep.get(k, True)}

            for key, value in metrics.items():
                camel = ''.join(part.capitalize() for part in key.split('_'))
                results.setdefault(f"Langkit{camel}Score", []).append(value)

        return results
# N-gram repetition evaluation
@MODEL_REGISTRY.register()
class NgramScorer(TextScorer):
    """Scores text by its n-gram uniqueness ratio (1.0 = no repeated n-grams)."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.ngrams = args_dict.get('ngrams')
        self.batch_size = 1
        self.data_type = 'text'
        self.score_name = 'NgramScore'
        self.score_type = float

    def evaluate_batch(self, batch):
        """Return unique/total n-gram ratio per sample (0.0 for empty texts)."""
        texts = next(iter(batch.values()))
        n = self.ngrams
        scores = []
        for text in texts:
            # Lowercase and strip punctuation before tokenizing on whitespace.
            words = re.sub(r'[^\w\s]', '', text.lower()).split()
            grams = [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
            scores.append(len(set(grams)) / len(grams) if grams else 0.0)
        return scores
@MODEL_REGISTRY.register()
class ClipScorer(ImageTextScorer):
    """CLIP ViT-B/32 image–caption similarity (cosine similarity scaled by 100)."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        device = args_dict["device"]
        cache_root = os.path.join(args_dict["model_cache_dir"], "clip")
        model, preprocess = clip.load(name="ViT-B/32", device=device, download_root=cache_root)
        self.model = model.eval()
        self.image_preprocessor = preprocess
        self.text_preprocessor = clip.tokenize
        self.device = device
        self.data_type = "image_caption"
        self.scorer_name = "ClipScorer"

    def evaluate_batch(self, sample):
        """sample[0]: preprocessed images; sample[1]: tokenized captions."""
        images, captions = sample[0], sample[1]
        image_features = self.model.encode_image(images.to(self.device))  # [batch_size, dim]
        text_features = self.model.encode_text(captions.squeeze(1).to(self.device))

        # Normalize so the per-pair dot product below is cosine similarity.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # [batch_size, 1, 1] -> [batch_size], scaled by 100.
        scores = torch.bmm(image_features.unsqueeze(1), text_features.unsqueeze(2)).squeeze(1).squeeze(1)*100
        return scores
'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /dataflow/Eval/image/clip_t5/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise 
ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /dataflow/Eval/image/image_aspect_ratio_scorer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from dataflow.core.scorer import ImageScorer 4 | from dataflow.utils.registry import MODEL_REGISTRY 5 | from dataflow.utils.image_utils import image_collate_fn 6 | 7 | @MODEL_REGISTRY.register() 8 | class ImageAspectRatioScorer(ImageScorer): 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.data_type = "image" 12 | self.scorer_name = "ImageAspectRatioScorer" 13 | self.collate_fn = image_collate_fn 14 | 15 | def evaluate_batch(self, sample): 16 | scores = [] 17 | for img in sample: 18 | scores.append(img.size[0] / img.size[1]) 19 | scores = np.array(scores) 20 | 21 | return scores 22 | -------------------------------------------------------------------------------- /dataflow/Eval/image/image_resolution_scorer.py: -------------------------------------------------------------------------------- 1 | from dataflow.core.scorer import ImageScorer 2 | from dataflow.utils.registry import MODEL_REGISTRY 3 | from dataflow.utils.image_utils import image_collate_fn 4 | 5 | @MODEL_REGISTRY.register() 6 | class ImageResolutionScorer(ImageScorer): 7 | def __init__(self, args_dict: dict): 8 | super().__init__(args_dict) 9 | self.data_type = "image" 10 | self.scorer_name = "ImageResolutionScorer" 11 | self.collate_fn = image_collate_fn 12 | self.score_type_list = ['width', 'height'] 13 | 14 | def evaluate_batch(self, sample): 15 | # format of return scores: 16 | # [ 17 | # {'width': ndarray, 'height': ndarray}, 18 | # {'width': ndarray, 'height': ndarray}, 19 | # ... 
20 | # ] 21 | scores = [] 22 | for img in sample: 23 | scores.append({'width': img.size[0], 'height': img.size[1]}) 24 | 25 | return scores 26 | -------------------------------------------------------------------------------- /dataflow/Eval/image/image_text_scorer.py: -------------------------------------------------------------------------------- 1 | from .clip_scorer import ClipScorer 2 | from .longclip_scorer import LongClipScorer 3 | from .clip_t5_scorer import ClipT5Scorer 4 | from .fleur_scorer import FleurScorer -------------------------------------------------------------------------------- /dataflow/Eval/image/kid/lenet.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/dataflow/Eval/image/kid/lenet.pth -------------------------------------------------------------------------------- /dataflow/Eval/image/kid/lenet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | 4 | 5 | class LeNet5(nn.Module): 6 | """ 7 | Input - 1x32x32 8 | C1 - 6@28x28 (5x5 kernel) 9 | tanh 10 | S2 - 6@14x14 (2x2 kernel, stride 2) Subsampling 11 | C3 - 16@10x10 (5x5 kernel, complicated shit) 12 | tanh 13 | S4 - 16@5x5 (2x2 kernel, stride 2) Subsampling 14 | C5 - 120@1x1 (5x5 kernel) 15 | F6 - 84 16 | tanh 17 | F7 - 10 (Output) 18 | """ 19 | def __init__(self): 20 | super(LeNet5, self).__init__() 21 | 22 | self.convnet = nn.Sequential(OrderedDict([ 23 | ('c1', nn.Conv2d(1, 6, kernel_size=(5, 5))), 24 | ('tanh1', nn.Tanh()), 25 | ('s2', nn.MaxPool2d(kernel_size=(2, 2), stride=2, padding=1)), 26 | ('c3', nn.Conv2d(6, 16, kernel_size=(5, 5))), 27 | ('tanh3', nn.Tanh()), 28 | ('s4', nn.MaxPool2d(kernel_size=(2, 2), stride=2, padding=1)), 29 | ('c5', nn.Conv2d(16, 120, kernel_size=(5, 5))), 30 | ('tanh5', nn.Tanh()) 31 | ])) 32 | 33 | self.fc = 
nn.Sequential(OrderedDict([ 34 | ('f6', nn.Linear(120, 84)), 35 | ('tanh6', nn.Tanh()), 36 | ('f7', nn.Linear(84, 10)), 37 | ('sig7', nn.LogSoftmax(dim=-1)) 38 | ])) 39 | 40 | def forward(self, img): 41 | output = self.convnet(img) 42 | output = output.view(img.size(0), -1) 43 | output = self.fc(output) 44 | return output 45 | 46 | def extract_features(self, img): 47 | output = self.convnet(img.float()) 48 | output = output.view(img.size(0), -1) 49 | output = self.fc[1](self.fc[0](output)) 50 | return output 51 | -------------------------------------------------------------------------------- /dataflow/Eval/image/longclip_scorer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | import subprocess 5 | from dataflow.Eval.image.longclip.model import longclip 6 | from dataflow.core.scorer import ImageTextScorer 7 | from dataflow.utils.registry import MODEL_REGISTRY 8 | from dataflow.utils.utils import download_model_from_hf 9 | 10 | 11 | @MODEL_REGISTRY.register() 12 | class LongClipScorer(ImageTextScorer): 13 | def __init__(self, args_dict: dict): 14 | super().__init__(args_dict) 15 | model_cache_dir = os.path.join(args_dict["model_cache_dir"], "longclip") 16 | model_cache_path=os.path.join(model_cache_dir, f"longclip-{args_dict['model_size']}.pt") 17 | try: 18 | model, preprocess = longclip.load(name=model_cache_path, device=args_dict["device"]) 19 | except: 20 | download_model_from_hf("BeichenZhang/LongCLIP-" + args_dict["model_size"], model_cache_dir) 21 | model, preprocess = longclip.load(name=model_cache_path, device=args_dict["device"]) 22 | 23 | self.model = model.eval() 24 | self.image_preprocessor = preprocess 25 | self.text_preprocessor = longclip.tokenize 26 | self.device = args_dict["device"] 27 | self.data_type = "image_caption" 28 | self.scorer_name = "LongClipScorer" 29 | 30 | 31 | def evaluate_batch(self, sample): 32 | image_features = 
self.model.encode_image(sample[0].to(self.device)) # [batch_size, dim] 33 | text_features = self.model.encode_text(sample[1].squeeze(1).to(self.device)) 34 | 35 | scores = torch.bmm(image_features.unsqueeze(1), text_features.unsqueeze(2)).squeeze(1).squeeze(1) # [batch_size, 1, 1]->[batch_size] 36 | return scores 37 | -------------------------------------------------------------------------------- /dataflow/Eval/video/dover/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .models import * 3 | -------------------------------------------------------------------------------- /dataflow/Eval/video/dover/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | ## API for DOVER and its variants 2 | from .basic_datasets import * 3 | from .dover_datasets import * 4 | -------------------------------------------------------------------------------- /dataflow/Eval/video/dover/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .conv_backbone import convnext_3d_small, convnext_3d_tiny 2 | from .evaluator import DOVER, BaseEvaluator, BaseImageEvaluator 3 | from .head import IQAHead, VARHead, VQAHead 4 | from .swin_backbone import SwinTransformer2D as IQABackbone 5 | from .swin_backbone import SwinTransformer3D as VQABackbone 6 | from .swin_backbone import swin_3d_small, swin_3d_tiny 7 | 8 | __all__ = [ 9 | "VQABackbone", 10 | "IQABackbone", 11 | "VQAHead", 12 | "IQAHead", 13 | "VARHead", 14 | "BaseEvaluator", 15 | "BaseImageEvaluator", 16 | "DOVER", 17 | ] 18 | -------------------------------------------------------------------------------- /dataflow/Eval/video/emscore/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import em_cos_score, get_idf_dict 2 | 3 | __all__ = [ 4 | 'em_cos_score', 5 | 'get_idf_dict' 6 | ] 
-------------------------------------------------------------------------------- /dataflow/Eval/video/fastvqa/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .models import * 3 | -------------------------------------------------------------------------------- /dataflow/Eval/video/fastvqa/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | ## Version 0.0 Dataset API, includes FAST-VQA and its variants 2 | from .basic_datasets import ( 3 | FastVQAPlusPlusDataset, 4 | FragmentVideoDataset, 5 | FragmentImageDataset, 6 | ResizedVideoDataset, 7 | ResizedImageDataset, 8 | CroppedVideoDataset, 9 | CroppedImageDataset, 10 | SampleFrames, 11 | FragmentSampleFrames, 12 | ) 13 | 14 | ## Version 1.0 Dataset API, includes DiViDe VQA and its variants 15 | from .fusion_datasets import get_spatial_fragments, SimpleDataset, FusionDataset, LSVQPatchDataset, FusionDatasetK400 16 | 17 | 18 | __all__ = [ 19 | "FragmentVideoDataset", 20 | "FragmentImageDataset", 21 | "ResizedVideoDataset", 22 | "ResizedImageDataset", 23 | "CroppedVideoDataset", 24 | "CroppedImageDataset", 25 | "LSVQPatchDataset", 26 | "get_spatial_fragments", 27 | "SampleFrames", 28 | "FragmentSampleFrames", 29 | "SimpleDataset", 30 | "FusionDatasetK400", 31 | "FusionDataset", 32 | ] -------------------------------------------------------------------------------- /dataflow/Eval/video/fastvqa/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .swin_backbone import SwinTransformer3D as VQABackbone 2 | from .swin_backbone import SwinTransformer2D as IQABackbone 3 | from .head import VQAHead, IQAHead, VARHead 4 | from .swin_backbone import swin_3d_tiny, swin_3d_small 5 | from .conv_backbone import convnext_3d_tiny, convnext_3d_small 6 | from .xclip_backbone import build_x_clip_model 7 | from .evaluator import BaseEvaluator, 
BaseImageEvaluator, DiViDeAddEvaluator 8 | 9 | __all__ = [ 10 | "VQABackbone", 11 | "IQABackbone", 12 | "VQAHead", 13 | "IQAHead", 14 | "VARHead", 15 | "BaseEvaluator", 16 | "BaseImageEvaluator", 17 | "DiViDeAddEvaluator", 18 | ] 19 | -------------------------------------------------------------------------------- /dataflow/Eval/video/fastvqa/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "3.1.0" 2 | 3 | 4 | def parse_version_info(version_str): 5 | version_info = [] 6 | for x in version_str.split("."): 7 | if x.isdigit(): 8 | version_info.append(int(x)) 9 | elif x.find("rc") != -1: 10 | patch_version = x.split("rc") 11 | version_info.append(int(patch_version[0])) 12 | version_info.append(f"rc{patch_version[1]}") 13 | return tuple(version_info) 14 | 15 | 16 | version_info = parse_version_info(__version__) 17 | -------------------------------------------------------------------------------- /dataflow/Eval/video/video_resolution_scorer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import av 3 | 4 | import numpy as np 5 | from jsonargparse.typing import PositiveInt 6 | from dataflow.core import VideoScorer 7 | from dataflow.utils.registry import MODEL_REGISTRY 8 | 9 | @MODEL_REGISTRY.register() 10 | class VideoResolutionScorer(VideoScorer): 11 | 12 | def __init__(self, args_dict): 13 | super().__init__(args_dict) 14 | 15 | def init_score(self, len_dataset): 16 | ''' 17 | return empty score dict for this scorer 18 | eg: {'Default': np.array([-1] * len_dataset)} 19 | ''' 20 | return {'width': np.array([np.nan] * len_dataset), 'height': np.array([np.nan] * len_dataset)} 21 | 22 | 23 | def evaluate_batch(self, sample, key=None, rank=None): 24 | video_data = av.open(sample['video'][0]) 25 | video_stream = video_data.streams.video[0] 26 | video_width, video_height = video_stream.codec_context.width, video_stream.codec_context.height 27 | for video_stream 
in video_data.streams.video: 28 | video_stream.close(strict=False) 29 | 30 | video_data.close() 31 | return {'width': video_width, 'height': video_height} 32 | 33 | -------------------------------------------------------------------------------- /dataflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | from .utils import * 3 | # from .Eval import * 4 | # from .process import * 5 | from .format import * 6 | 7 | from .utils.utils import list_image_eval_metrics, get_scorer -------------------------------------------------------------------------------- /dataflow/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import new_init_config 2 | 3 | __all__ = [ 4 | 'new_init_config' 5 | ] -------------------------------------------------------------------------------- /dataflow/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .scorer import Scorer, VideoScorer, VideoTextScorer, TextScorer, GenTextScorer, ScoreRecord 2 | from .process.filter import Filter, ImageFilter, ImageTextFilter, VideoFilter, TextFilter, VideoTextFilter 3 | from .process.refiner import Refiner, TextRefiner 4 | from .process.deduplicator import Deduplicator, TextDeduplicator, ImageDeduplicator 5 | from .process.reasoner import ReasonerFilter 6 | 7 | __all__ = [ 8 | 'Scorer', 9 | 'VideoScorer', 10 | 'VideoTextScorer', 11 | 'TextScorer', 12 | 'GenTextScorer', 13 | 'ScoreRecord', 14 | 'Processor', 15 | 'Filter', 16 | 'TextFilter', 17 | 'ImageFilter', 18 | 'ImageTextFilter', 19 | 'VideoFilter', 20 | 'VideoTextFilter', 21 | 'Refiner', 22 | 'TextRefiner', 23 | 'Deduplicator', 24 | 'TextDeduplicator', 25 | 'ReasonerFilter' 26 | ] -------------------------------------------------------------------------------- /dataflow/core/process/deduplicator.py: 
-------------------------------------------------------------------------------- 1 | from datasets import Dataset 2 | 3 | class Deduplicator: 4 | 5 | def __init__(self, args): 6 | pass 7 | 8 | def dedup_func(self, dataset): 9 | raise NotImplementedError 10 | 11 | def __call__(self, dataset): 12 | init_len = len(dataset) 13 | deduped_dataset = self.dedup_func(dataset) 14 | print(f'Implemented {self.__class__.__name__}. Data Number: {init_len} -> {len(deduped_dataset)}', flush=True) 15 | 16 | return deduped_dataset 17 | 18 | class TextDeduplicator(Deduplicator): 19 | 20 | def __init__(self, args=None): 21 | self.data_type = "text" 22 | 23 | def __call__(self, dataset): 24 | init_len = len(dataset) 25 | labels = self.dedup_func(dataset) 26 | if isinstance(dataset.dataset, Dataset): 27 | def filter_by_labels(example, index): 28 | return labels[index] == 1 29 | dataset.dataset = dataset.dataset.filter(filter_by_labels, with_indices=True) 30 | deduped_dataset = dataset 31 | else: 32 | deduped_dataset = dataset.filter(labels) 33 | print(f'Implemented {self.dedupliactor_name}. 
Data Number: {init_len} -> {len(deduped_dataset)}') 34 | return deduped_dataset 35 | 36 | class ImageDeduplicator(Deduplicator): 37 | 38 | def __init__(self, args=None): 39 | self.data_type = "image" 40 | -------------------------------------------------------------------------------- /dataflow/core/process/reasoner.py: -------------------------------------------------------------------------------- 1 | from dataflow.data import DataFlowDataset 2 | from dataflow.core import ScoreRecord 3 | from datasets import Dataset 4 | 5 | class Reasoner(): 6 | def __init__(self, args=None): 7 | pass 8 | 9 | def reason_func(self, dataset): 10 | pass 11 | 12 | def __call__(self, dataset: DataFlowDataset): 13 | pass 14 | 15 | class ReasonerFilter(Reasoner): 16 | def __init__(self, args=None): 17 | super().__init__() 18 | self.data_type = "text" 19 | self.filter_name = "ReasonerFilter" 20 | self.args = args 21 | 22 | api_args = args.get('api_args', None) 23 | if api_args is not None: 24 | self.model_name = api_args['model_name'] 25 | self.api_url = api_args['api_url'] 26 | self.mode_test = api_args['mode_test'] 27 | def filter_func(self, dataset): 28 | pass 29 | 30 | def __call__(self, dataset: DataFlowDataset): 31 | """Processes the dataset using the reasoner""" 32 | init_len = len(dataset) 33 | score_record = ScoreRecord() 34 | dataset.set_score_record(score_record) 35 | labels = self.filter_func(dataset) 36 | 37 | if isinstance(dataset.dataset, Dataset): 38 | def filter_by_labels(example, index): 39 | return labels[index] == 1 40 | dataset.dataset = dataset.dataset.filter(filter_by_labels, with_indices=True) 41 | filtered_dataset = dataset 42 | else: 43 | filtered_dataset = dataset.filter(labels) 44 | 45 | print(f'Implemented {self.filter_name}. 
Data Number: {init_len} -> {len(filtered_dataset)}', flush=True) 46 | return filtered_dataset -------------------------------------------------------------------------------- /dataflow/core/process/refiner.py: -------------------------------------------------------------------------------- 1 | class Refiner(): 2 | 3 | def __init__(self, args): 4 | pass 5 | 6 | def __call__(self, dataset): 7 | pass 8 | 9 | class TextRefiner(Refiner): 10 | 11 | def __init__(self, args=None): 12 | self.data_type = "text" 13 | 14 | def __call__(self, dataset): 15 | refined_dataset, numbers = self.refine_func(dataset) 16 | print(f'Implemented {self.refiner_name}. {numbers} data refined.', flush=True) 17 | 18 | return refined_dataset 19 | -------------------------------------------------------------------------------- /dataflow/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataflow_dataset import DataFlowDataset, DataFlowSubset, DataFlowDSDict 2 | from .pure_video_dataset import PureVideoDataset 3 | from .video_caption_dataset import VideoCaptionDataset 4 | from .text_dataset import TextDataset 5 | from .image_dataset import ImageDataset, ImageCaptionDataset 6 | 7 | __all__ = [ 8 | 'DataFlowDataset', 9 | 'DataFlowSubset', 10 | 'DataFlowDSDict', 11 | 'PureVideoDataset', 12 | 'VideoCaptionDataset', 13 | 'TextDataset', 14 | 'ImageDataset', 15 | 'ImageCaptionDataset', 16 | ] -------------------------------------------------------------------------------- /dataflow/data/pure_video_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import numpy as np 4 | from .dataflow_dataset import DataFlowDataset 5 | 6 | class PureVideoDataset(DataFlowDataset): 7 | 8 | def __init__(self, meta_data, video_folder): 9 | super().__init__() 10 | self.meta_data = meta_data 11 | self.video_folder = video_folder 12 | 13 | def __getitem__(self, index): 14 | sample_metadata = 
self.meta_data[index] 15 | if 'flickr_id' in sample_metadata.keys(): 16 | sample_metadata['video'] = os.path.join(self.video_folder, str(sample_metadata['flickr_id'])) + '.mp4' 17 | elif 'videoID' in sample_metadata.keys(): 18 | sample_metadata['video'] = os.path.join(self.video_folder, str(sample_metadata['videoID'])) + '.mp4' 19 | else: 20 | sample_metadata['video'] = os.path.join(self.video_folder, str(sample_metadata['video'])) 21 | for func in self.map_func: 22 | sample_metadata = func(sample_metadata) 23 | return {'video': sample_metadata['video']} 24 | 25 | def __len__(self): 26 | return len(self.meta_data) 27 | 28 | def get_dump_data(self): 29 | return self.meta_data 30 | 31 | def dump(self, save_path): 32 | import json 33 | import uuid 34 | if os.path.exists(save_path): 35 | save_file = save_path if os.path.isfile(save_path) else save_path + uuid.uuid4().hex + '.json' 36 | with open(save_file, 'w+') as f: 37 | json.dump(self.meta_data) 38 | -------------------------------------------------------------------------------- /dataflow/data/video_caption_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .dataflow_dataset import DataFlowDataset 3 | class VideoCaptionDataset(DataFlowDataset): 4 | 5 | def __init__(self, meta_data, video_folder): 6 | 7 | super().__init__() 8 | self.meta_data = meta_data 9 | self.video_folder = video_folder 10 | 11 | def __getitem__(self, index) : 12 | 13 | sample_meta_data = self.meta_data[index] 14 | 15 | return { 16 | 'captions': sample_meta_data['enCap'].tolist() if type(sample_meta_data['enCap']) is not list else sample_meta_data['enCap'], 17 | 'video': os.path.join(self.video_folder, sample_meta_data['videoID'] + '.mp4') if 'videoID' in sample_meta_data.keys() else os.path.join(self.video_folder, sample_meta_data['video']) 18 | } 19 | 20 | def __len__(self): 21 | return len(self.meta_data) 22 | 23 | def get_dump_data(self): 24 | return self.meta_data 25 | 26 | 
def dump(self, save_path): 27 | import json 28 | import uuid 29 | if os.path.exists(save_path): 30 | save_file = save_path if os.path.isfile(save_path) else save_path + uuid.uuid4().hex + '.json' 31 | with open(save_file, 'w+') as f: 32 | json.dump(self.meta_data) -------------------------------------------------------------------------------- /dataflow/format/__init__.py: -------------------------------------------------------------------------------- 1 | from .video_formatter import PureVideoFormatter, VideoCaptionFormatter 2 | from .text_formatter import TextFormatter, GenTextFormatter 3 | from .image_formatter import PureImageFormatter, ImageCaptionFormatter 4 | 5 | __all__ = [ 6 | 'PureVideoFormatter', 7 | 'VideoCaptionFormatter', 8 | 'TextFormatter', 9 | 'GenTextFormatter', 10 | 'PureImageFormatter', 11 | 'ImageCaptionFormatter' 12 | ] -------------------------------------------------------------------------------- /dataflow/format/video_formatter.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from dataflow.data import PureVideoDataset, VideoCaptionDataset 3 | from torch.utils.data import Dataset 4 | from dataflow.utils.registry import FORMATTER_REGISTRY 5 | 6 | @FORMATTER_REGISTRY.register() 7 | class PureVideoFormatter(): 8 | 9 | def __init__(self, cfg): 10 | self.meta_path = cfg['meta_data_path'] 11 | self.video_path = cfg['data_path'] 12 | 13 | def load_dataset(self) -> Dataset: 14 | if self.meta_path.endswith('.csv'): 15 | df = pd.read_csv(self.meta_path) 16 | elif self.meta_path.endswith('.tsv'): 17 | df = pd.read_csv(self.meta_path, sep="\t") 18 | elif self.meta_path.endswith('.json'): 19 | df = pd.read_json(self.meta_path) 20 | elif self.meta_path.endswith('.parquet'): 21 | df = pd.read_parquet(self.meta_path) 22 | else: 23 | return ValueError(f"Unsupported file type: {self.meta_path}") 24 | meta_data = df.to_dict(orient='records') 25 | return PureVideoDataset(meta_data, 
self.video_path) 26 | 27 | @FORMATTER_REGISTRY.register() 28 | class VideoCaptionFormatter(): 29 | 30 | def __init__(self, cfg): 31 | self.meta_path = cfg['meta_data_path'] 32 | self.video_path = cfg['data_path'] 33 | 34 | def load_dataset(self) -> Dataset: 35 | if self.meta_path.endswith('.csv'): 36 | df = pd.read_csv(self.meta_path) 37 | elif self.meta_path.endswith('.tsv'): 38 | df = pd.read_csv(self.meta_path, sep="\t") 39 | elif self.meta_path.endswith('.json'): 40 | df = pd.read_json(self.meta_path) 41 | elif self.meta_path.endswith('.parquet'): 42 | df = pd.read_parquet(self.meta_path) 43 | else: 44 | return ValueError(f"Unsupported file type: {self.meta_path}") 45 | meta_data = df.to_dict(orient='records') 46 | return VideoCaptionDataset(meta_data, self.video_path) -------------------------------------------------------------------------------- /dataflow/generate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/dataflow/generate/__init__.py -------------------------------------------------------------------------------- /dataflow/process/image/__init__.py: -------------------------------------------------------------------------------- 1 | from .filters import * 2 | from .deduplicators import * -------------------------------------------------------------------------------- /dataflow/process/image/deduplicators/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from dataflow.utils.registry import LazyLoader 3 | 4 | _import_structure = { 5 | "ImagePHashDeduplicator": ("dataflow/process/image/deduplicators/deduplicator.py", "ImagePHashDeduplicator"), 6 | "ImageAHashDeduplicator": ("dataflow/process/image/deduplicators/deduplicator.py", "ImageAHashDeduplicator"), 7 | "ImageDHashDeduplicator": ("dataflow/process/image/deduplicators/deduplicator.py", 
"ImageDHashDeduplicator"), 8 | "ImageWHashDeduplicator": ("dataflow/process/image/deduplicators/deduplicator.py", "ImageWHashDeduplicator"), 9 | } 10 | sys.modules[__name__] = LazyLoader(__name__, "dataflow/process/image/deduplicators/", _import_structure) 11 | 12 | # from .deduplicator import ImagePHashDeduplicator, ImageAHashDeduplicator, ImageDHashDeduplicator, ImageWHashDeduplicator 13 | -------------------------------------------------------------------------------- /dataflow/process/image/filters/image_aspect_ratio_filter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from dataflow.core import ImageFilter 3 | from dataflow.Eval.image import ImageAspectRatioScorer 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class ImageAspectRatioFilter(ImageFilter): 8 | def __init__(self, args_dict: dict): 9 | super().__init__() 10 | self.min_ratio = args_dict["min_ratio"] if "min_ratio" in args_dict else -np.inf 11 | self.max_ratio = args_dict["max_ratio"] if "max_ratio" in args_dict else np.inf 12 | 13 | self.scorer = ImageAspectRatioScorer(args_dict=args_dict) 14 | 15 | def filter_func(self, sample): 16 | _, score = self.scorer(sample) 17 | 18 | result = np.array(((self.min_ratio <= score['Default']) & (score['Default'] <= self.max_ratio)).astype(int)) 19 | 20 | return result 21 | -------------------------------------------------------------------------------- /dataflow/process/image/filters/image_resolution_filter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from dataflow.core import ImageFilter 3 | from dataflow.Eval.image import ImageResolutionScorer 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class ImageResolutionFilter(ImageFilter): 8 | def __init__(self, args_dict: dict): 9 | super().__init__() 10 | self.min_width = 
args_dict["min_width"] 11 | self.max_width = args_dict["max_width"] 12 | self.min_height = args_dict["min_height"] 13 | self.max_height = args_dict["max_height"] 14 | self.scorer = ImageResolutionScorer(args_dict=args_dict) 15 | 16 | def filter_func(self, sample): 17 | _, score = self.scorer(sample) 18 | width_condition = (self.min_width <= score['width']) & (score['width'] <= self.max_width) 19 | height_condition = (self.min_height <= score['height']) & (score['height'] <= self.max_height) 20 | result = np.array((width_condition & height_condition).astype(int)) 21 | return result 22 | -------------------------------------------------------------------------------- /dataflow/process/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .filters import * 2 | from .refiners import * 3 | from .deduplicators import * 4 | from .reasoners import * -------------------------------------------------------------------------------- /dataflow/process/text/deduplicators/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from dataflow.utils.registry import LazyLoader 3 | 4 | _import_structure = { 5 | "HashDeduplicator": ("dataflow/process/text/deduplicators/hash_deduplicator.py", "HashDeduplicator"), 6 | "SemDeduplicator": ("dataflow/process/text/deduplicators/sem_deduplicator.py", "SemDeduplicator"), 7 | "SimHashDeduplicator": ("dataflow/process/text/deduplicators/simhash_deduplicator.py", "SimHashDeduplicator"), 8 | "CCNetDeduplicator": ("dataflow/process/text/deduplicators/ccnet_deduplicator.py", "CCNetDeduplicator"), 9 | "NgramHashDeduplicator": ("dataflow/process/text/deduplicators/ngramhash_deduplicator.py", "NgramHashDeduplicator"), 10 | "MinHashDeduplicator": ("dataflow/process/text/deduplicators/minhash_deduplicator.py", "MinHashDeduplicator") 11 | } 12 | 13 | sys.modules[__name__] = LazyLoader(__name__, "dataflow/process/text/deduplicators", 
_import_structure) 14 | 15 | -------------------------------------------------------------------------------- /dataflow/process/text/deduplicators/ccnet_deduplicator.py: -------------------------------------------------------------------------------- 1 | # 比较SHA-1数字前64位 CCNet 2 | from dataflow.core import TextDeduplicator 3 | from dataflow.utils.registry import PROCESSOR_REGISTRY 4 | from dataflow.utils.text_utils import sha1_hash 5 | from tqdm import tqdm 6 | 7 | @PROCESSOR_REGISTRY.register() 8 | class CCNetDeduplicator(TextDeduplicator): 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.dedupliactor_name = 'CCNetDeduplicator' 12 | self.bit_length = args_dict.get('bit_length', 64) 13 | 14 | def _compute_hash(self, text: str) -> str: 15 | return sha1_hash(text, self.bit_length) 16 | 17 | def dedup_func(self, dataset): 18 | seen_hashes = set() 19 | labels = [0] * len(dataset) 20 | for idx, sample in tqdm(enumerate(dataset), desc=f"Implementing {self.dedupliactor_name}", total=len(dataset)): 21 | if isinstance(dataset.keys, list): 22 | text = " ".join([str(sample[key]) for key in dataset.keys]) 23 | text = text.encode('utf-8') 24 | else: 25 | text = str(sample[dataset.keys]).encode('utf-8') 26 | hash_value = self._compute_hash(text) 27 | if hash_value not in seen_hashes: 28 | labels[idx] = 1 29 | seen_hashes.add(hash_value) 30 | return labels 31 | -------------------------------------------------------------------------------- /dataflow/process/text/deduplicators/hash_deduplicator.py: -------------------------------------------------------------------------------- 1 | from dataflow.core import TextDeduplicator 2 | import hashlib 3 | from dataflow.utils.registry import PROCESSOR_REGISTRY 4 | from dataflow.utils.text_utils import md5, sha256, xxh3_128 5 | from tqdm import tqdm 6 | 7 | @PROCESSOR_REGISTRY.register() 8 | class HashDeduplicator(TextDeduplicator): 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 
11 | self.dedupliactor_name = 'HashDeduplicator' 12 | self.hash_func = args_dict.get('hash_func', 'md5') 13 | self.hash_func_dict = { 14 | 'md5': md5, 15 | 'sha256': sha256, 16 | 'xxh3': xxh3_128 17 | } 18 | 19 | def _compute_hash(self, text: str) -> str: 20 | return self.hash_func_dict[self.hash_func](text.encode('utf-8')).hexdigest() 21 | 22 | def dedup_func(self, dataset): 23 | seen_hashes = set() 24 | labels = [0] * len(dataset) 25 | for idx, sample in tqdm(enumerate(dataset), desc=f"Implementing {self.dedupliactor_name}", total=len(dataset)): 26 | if isinstance(dataset.keys, list): 27 | text = " ".join([str(sample[key]) for key in dataset.keys]) 28 | else: 29 | text = str(sample[dataset.keys]) 30 | 31 | hash_value = self._compute_hash(text) 32 | 33 | if hash_value not in seen_hashes: 34 | seen_hashes.add(hash_value) 35 | labels[idx] = 1 36 | 37 | return labels -------------------------------------------------------------------------------- /dataflow/process/text/deduplicators/minhash_deduplicator.py: -------------------------------------------------------------------------------- 1 | from dataflow.core import TextDeduplicator 2 | from dataflow.utils.registry import PROCESSOR_REGISTRY 3 | from datasketch import MinHash, MinHashLSH # use datasketch-1.6.5 4 | from tqdm import tqdm 5 | from collections.abc import Sequence 6 | 7 | 8 | @PROCESSOR_REGISTRY.register() 9 | class MinHashDeduplicator(TextDeduplicator): 10 | def __init__(self, args_dict: dict): 11 | super().__init__(args_dict) 12 | self.dedupliactor_name = 'MinHashDeduplicator' 13 | self.num_perm = args_dict.get('num_perm', 128) 14 | self.threshold = args_dict.get('threshold', 0.9) 15 | self.use_n_gram = args_dict.get('use_n_gram', True) 16 | self.n_gram = args_dict.get('n_gram', 5) 17 | 18 | def create_minhash(self, data): 19 | minhash = MinHash(num_perm=self.num_perm) 20 | if self.use_n_gram: 21 | for i in range(len(data) - self.n_gram + 1): 22 | minhash.update(data[i:i + self.n_gram].encode('utf8')) 23 
| else: 24 | for d in data: 25 | minhash.update(d.encode('utf8')) 26 | return minhash 27 | 28 | def dedup_func(self, dataset): 29 | lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm) 30 | 31 | labels = [0] * len(dataset) 32 | with lsh.insertion_session() as session: 33 | for idx, sample in tqdm(enumerate(dataset), desc=f"Implementing {self.dedupliactor_name}", total=len(dataset)): 34 | text = str(sample[dataset.keys]) 35 | minhash = self.create_minhash(text) 36 | result = lsh.query(minhash) 37 | if len(result) == 0: 38 | labels[idx] = 1 39 | session.insert(idx, minhash) 40 | 41 | return labels 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /dataflow/process/text/deduplicators/ngramhash_deduplicator.py: -------------------------------------------------------------------------------- 1 | from dataflow.core import TextDeduplicator 2 | from dataflow.utils.registry import PROCESSOR_REGISTRY 3 | from dataflow.utils.text_utils import md5, sha256, xxh3_128 4 | from tqdm import tqdm 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class NgramHashDeduplicator(TextDeduplicator): 8 | def __init__(self, args_dict: dict): 9 | super().__init__(args_dict) 10 | self.dedupliactor_name = 'NgramHashDeduplicator' 11 | self.n_gram = args_dict.get('n_gram', 3) 12 | self.hash_func = args_dict.get('hash_func', 'md5') 13 | self.diff_size = args_dict.get('diff_size', 1) # 有diff_size个hash值不同,则认为不同 14 | self.hash_func_dict = { 15 | 'md5': md5, 16 | 'sha256': sha256, 17 | 'xxh3': xxh3_128 18 | } 19 | 20 | def _compute_hash(self, text: str) -> str: 21 | return self.hash_func_dict[self.hash_func](text.encode('utf-8')).hexdigest() 22 | 23 | def dedup_func(self, dataset): 24 | seen_hashes = [] 25 | labels = [0] * len(dataset) 26 | for idx, sample in tqdm(enumerate(dataset), desc=f"Implementing {self.dedupliactor_name}", total=len(dataset)): 27 | if isinstance(dataset.keys, list): 28 | text = " 
".join([str(sample[key]) for key in dataset.keys]) 29 | else: 30 | text = str(sample[dataset.keys]) 31 | gram_length = len(text) // self.n_gram 32 | ngrams = [text[i*gram_length:(i+1)*gram_length] for i in range(self.n_gram)] 33 | hash_value = set(self._compute_hash(ngram) for ngram in ngrams) 34 | if all(len(hash_value & hash) < self.diff_size for hash in seen_hashes): 35 | labels[idx]=1 36 | seen_hashes.append(hash_value) 37 | return labels 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /dataflow/process/text/deduplicators/simhash_deduplicator.py: -------------------------------------------------------------------------------- 1 | from dataflow.core import TextDeduplicator 2 | from dataflow.utils.registry import PROCESSOR_REGISTRY 3 | from collections import defaultdict 4 | from typing import List 5 | from dataflow.utils.text_utils import md5_digest, sha256_digest, xxh3_128_digest 6 | from simhash import Simhash 7 | from tqdm import tqdm 8 | 9 | @PROCESSOR_REGISTRY.register() 10 | class SimHashDeduplicator(TextDeduplicator): 11 | def __init__(self, args_dict: dict): 12 | super().__init__(args_dict) 13 | self.deduplicator_name = 'SimHashDeduplicator' 14 | self.fingerprint_size = args_dict.get('fingerprint_size', 64) 15 | self.bound = args_dict.get('bound', 0.1) 16 | 17 | 18 | def dedup_func(self, dataset): 19 | simhashes = [] 20 | labels = [0] * len(dataset) 21 | def get_similarity(simhash, another_simhash): 22 | max_hashbit = max(len(bin(simhash.value)), len(bin(another_simhash.value))) 23 | distince = simhash.distance(another_simhash) 24 | similar = 1 - distince / max_hashbit 25 | return similar 26 | for idx, sample in tqdm(enumerate(dataset), desc=f"Implementing {self.dedupliactor_name}", total=len(dataset)): 27 | if isinstance(dataset.keys, list): 28 | text = " ".join([str(sample[key]) for key in dataset.keys]) 29 | else: 30 | text = str(sample[dataset.keys]) 31 | simhash = Simhash(text, 
f=self.fingerprint_size) 32 | if all(get_similarity(simhash, another_simhash) < 1 - self.bound for another_simhash in simhashes): 33 | labels[idx]=1 34 | simhashes.append(simhash) 35 | return labels 36 | 37 | 38 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/alpagasus_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import AlpagasusScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class AlpagasusFilter(TextFilter): 8 | def __init__(self, args_dict: dict): 9 | super().__init__(args_dict) 10 | self.min_score = args_dict['min_score'] 11 | self.max_score = args_dict['max_score'] 12 | scorer_args = args_dict.get('scorer_args') 13 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 14 | self.scorer = AlpagasusScorer(scorer_args) 15 | self.filter_name = 'AlpagasusFilter' 16 | 17 | def filter_func(self, dataset): 18 | _, scores = self.scorer(dataset) 19 | return np.array([self.min_score <= score <= self.max_score for score in scores['Default']]).astype(int) 20 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/debertav3_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import DebertaV3Scorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class DebertaV3Filter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.allowed_scores = args_dict['allowed_scores'] 12 | scorer_args = args_dict.get('scorer_args', {}) 13 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 14 | self.scorer = 
DebertaV3Scorer(scorer_args) 15 | self.filter_name = 'DebertaV3Filter' 16 | 17 | def filter_func(self, dataset): 18 | _, scores = self.scorer(dataset) 19 | metric_scores = scores['Default'] 20 | metric_filter = np.array([1 if score in self.allowed_scores else 0 for score in metric_scores]) 21 | return metric_filter 22 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/deita_complexity_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import DeitaComplexityScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class DeitaComplexityFilter(TextFilter): 8 | def __init__(self, args_dict: dict): 9 | super().__init__(args_dict) 10 | self.min_score = args_dict['min_score'] 11 | self.max_score = args_dict['max_score'] 12 | scorer_args = args_dict.get('scorer_args') 13 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 14 | self.scorer = DeitaComplexityScorer(scorer_args) 15 | self.filter_name = 'DeitaComplexityFilter' 16 | 17 | def filter_func(self, dataset): 18 | _, scores = self.scorer(dataset) 19 | return np.array([self.min_score <= score <= self.max_score for score in scores['Default']]).astype(int) 20 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/deita_quality_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import DeitaQualityScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class DeitaQualityFilter(TextFilter): 8 | def __init__(self, args_dict: dict): 9 | super().__init__(args_dict) 10 | self.min_score = args_dict['min_score'] 11 | self.max_score 
= args_dict['max_score'] 12 | scorer_args = args_dict.get('scorer_args') 13 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 14 | self.scorer = DeitaQualityScorer(scorer_args) 15 | self.filter_name = 'DeitaQualityFilter' 16 | 17 | def filter_func(self, dataset): 18 | _, scores = self.scorer(dataset) 19 | return np.array([self.min_score <= score <= self.max_score for score in scores['Default']]).astype(int) 20 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/finewebedu_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import FineWebEduScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class FineWebEduFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.min_score = args_dict['min_score'] 12 | self.max_score = args_dict['max_score'] 13 | scorer_args = args_dict.get('scorer_args', {}) 14 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 15 | self.scorer = FineWebEduScorer(scorer_args) 16 | self.filter_name = 'FineWebEduFilter' 17 | 18 | def filter_func(self, dataset): 19 | _, scores = self.scorer(dataset) 20 | metric_scores = np.array(scores['Default']) 21 | metric_filter = (self.min_score <= metric_scores) & (metric_scores <= self.max_score) 22 | return metric_filter.astype(int) 23 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/instag_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import InstagScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class 
InstagFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.min_score = args_dict['min_score'] 12 | self.max_score = args_dict['max_score'] 13 | scorer_args = args_dict.get('scorer_args', {}) 14 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 15 | self.scorer = InstagScorer(scorer_args) 16 | self.filter_name = 'InstagFilter' 17 | 18 | def filter_func(self, dataset): 19 | _, scores = self.scorer(dataset) 20 | metric_scores = np.array(scores['Default']) 21 | metric_filter = (self.min_score <= metric_scores) & (metric_scores <= self.max_score) 22 | return metric_filter.astype(int) 23 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/language_filter.py: -------------------------------------------------------------------------------- 1 | import fasttext 2 | import numpy as np 3 | from huggingface_hub import hf_hub_download 4 | from dataflow.core import TextFilter 5 | from dataflow.utils.registry import PROCESSOR_REGISTRY 6 | from tqdm import tqdm 7 | 8 | @PROCESSOR_REGISTRY.register() 9 | class LanguageFilter(TextFilter): 10 | 11 | def __init__(self, args_dict: dict): 12 | super().__init__(args_dict) 13 | self.allowed_languages = args_dict['allowed_languages'] 14 | model_cache_dir = args_dict.get('model_cache_dir', None) 15 | 16 | model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin", cache_dir=model_cache_dir) 17 | self.model = fasttext.load_model(model_path) 18 | self.filter_name = 'LanguageFilter' 19 | 20 | def filter_func(self, dataset): 21 | predictions = [] 22 | for item in tqdm(dataset, desc=f"Implementing {self.filter_name}"): 23 | if isinstance(dataset.keys, list): 24 | text_to_evaluate = " ".join(item[key].replace('\n', ' ') for key in dataset.keys) 25 | else: 26 | text_to_evaluate = item[dataset.keys].replace('\n', ' ') 27 | labels, _ = self.model.predict(text_to_evaluate, 
k=5) 28 | predictions.append(any(label in self.allowed_languages for label in labels)) 29 | 30 | return np.array(predictions).astype(int) 31 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/lexical_diversity_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import LexicalDiversityScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class LexicalDiversityFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | 12 | self.min_scores = args_dict['min_scores'] 13 | self.max_scores = args_dict['max_scores'] 14 | scorer_args = args_dict.get('scorer_args', {}) 15 | self.metrics_to_keep = scorer_args['metrics_to_keep'] 16 | if not set(self.min_scores.keys()).issubset(set(self.metrics_to_keep.keys())): 17 | raise ValueError("The filtering metrics must be a subset of metrics_to_keep.") 18 | 19 | if not set(self.max_scores.keys()).issubset(set(self.metrics_to_keep.keys())): 20 | raise ValueError("The filtering metrics must be a subset of metrics_to_keep.") 21 | self.metric_name_map = { 22 | 'mtld': 'LexicalDiversityMTLDScore', 23 | 'hdd': 'LexicalDiversityHD-DScore' 24 | } 25 | self.scorer = LexicalDiversityScorer(scorer_args) 26 | self.filter_name = 'LexicalDiversityFilter' 27 | 28 | def filter_func(self, dataset): 29 | _, scores = self.scorer(dataset) 30 | results = np.ones(len(dataset), dtype=int) 31 | for metric, min_score in self.min_scores.items(): 32 | max_score = self.max_scores[metric] 33 | score_key = self.metric_name_map[metric] 34 | metric_scores = scores[score_key] 35 | metric_filter = (min_score <= metric_scores) & (metric_scores <= max_score) 36 | results = results & metric_filter.astype(int) 37 | 38 | return results 39 | 
-------------------------------------------------------------------------------- /dataflow/process/text/filters/ngram_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import NgramScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class NgramFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.min_score = args_dict['min_score'] 12 | self.max_score = args_dict['max_score'] 13 | scorer_args = args_dict.get('scorer_args', {}) 14 | self.scorer = NgramScorer(scorer_args) 15 | self.filter_name = 'NgramFilter' 16 | 17 | def filter_func(self, dataset): 18 | _, scores = self.scorer(dataset) 19 | return np.array([self.min_score <= score <= self.max_score for score in scores['Default']]).astype(int) 20 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/perplexity_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import PerplexityScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class PerplexityFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.min_score = args_dict['min_score'] 12 | self.max_score = args_dict['max_score'] 13 | scorer_args = args_dict.get('scorer_args', {}) 14 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 15 | self.scorer = PerplexityScorer(scorer_args) 16 | self.filter_name = 'PerplexityFilter' 17 | 18 | def filter_func(self, dataset): 19 | _, scores = self.scorer(dataset) 20 | metric_scores = np.array(scores['Default']) 21 | metric_filter = (self.min_score <= metric_scores) & 
(metric_scores <= self.max_score) 22 | return metric_filter.astype(int) 23 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/perspective_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import PerspectiveScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class PerspectiveFilter(TextFilter): 8 | def __init__(self, args_dict: dict): 9 | super().__init__(args_dict) 10 | self.min_score = args_dict['min_score'] 11 | self.max_score = args_dict['max_score'] 12 | scorer_args = args_dict.get('scorer_args') 13 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 14 | self.scorer = PerspectiveScorer(scorer_args) 15 | self.filter_name = 'PerspectiveFilter' 16 | 17 | def filter_func(self, dataset): 18 | _, scores = self.scorer(dataset) 19 | return np.array([self.min_score <= score <= self.max_score for score in scores['Default']]).astype(int) 20 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/presidio_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import PresidioScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class PresidioFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.min_score = args_dict['min_score'] 12 | self.max_score = args_dict['max_score'] 13 | scorer_args = args_dict.get('scorer_args', {}) 14 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 15 | self.scorer = PresidioScorer(scorer_args) 16 | self.filter_name = 'PresidioFilter' 17 | 18 | def filter_func(self, 
dataset): 19 | _, scores = self.scorer(dataset) 20 | metric_scores = np.array(scores['Default']) 21 | metric_filter = (self.min_score <= metric_scores) & (metric_scores <= self.max_score) 22 | return metric_filter.astype(int) 23 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/qurating_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import QuratingScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class QuratingFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.min_scores = args_dict['min_scores'] 12 | self.max_scores = args_dict['max_scores'] 13 | scorer_args = args_dict.get('scorer_args', {}) 14 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 15 | self.scorer = QuratingScorer(scorer_args) 16 | self.filter_name = 'QuratingFilter' 17 | 18 | def filter_func(self, dataset): 19 | _, scores = self.scorer(dataset) 20 | 21 | results = np.ones(len(dataset), dtype=int) 22 | 23 | for label in self.min_scores.keys(): 24 | min_score = self.min_scores[label] 25 | max_score = self.max_scores[label] 26 | score_key = f"Qurating{''.join([word.capitalize() for word in label.split('_')])}Score" 27 | metric_scores = np.array(scores[score_key]) 28 | metric_filter = (min_score <= metric_scores) & (metric_scores <= max_score) 29 | results = results & metric_filter.astype(int) 30 | 31 | return results 32 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/reward_model_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import RMScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry 
import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class RMFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.min_score = args_dict['min_score'] 12 | self.max_score = args_dict['max_score'] 13 | scorer_args = args_dict.get('scorer_args', {}) 14 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 15 | self.scorer = RMScorer(scorer_args) 16 | self.filter_name = 'RMFilter' 17 | 18 | def filter_func(self, dataset): 19 | _, scores = self.scorer(dataset) 20 | metric_scores = np.array(scores['Default']) 21 | metric_filter = (self.min_score <= metric_scores) & (metric_scores <= self.max_score) 22 | return metric_filter.astype(int) 23 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/superfiltering_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import SuperfilteringScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class SuperfilteringFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.min_score = args_dict['min_score'] 12 | self.max_score = args_dict['max_score'] 13 | scorer_args = args_dict.get('scorer_args') 14 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 15 | self.scorer = SuperfilteringScorer(scorer_args) 16 | self.filter_name = 'SuperfilteringFilter' 17 | 18 | def filter_func(self, dataset): 19 | _, scores = self.scorer(dataset) 20 | return np.array([self.min_score <= score <= self.max_score for score in scores['Default']]).astype(int) 21 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/textbook_filter.py: 
-------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import TextbookScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class TextbookFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.min_score = args_dict['min_score'] 12 | self.max_score = args_dict['max_score'] 13 | scorer_args = args_dict.get('scorer_args', {}) 14 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 15 | self.scorer = TextbookScorer(scorer_args) 16 | self.filter_name = 'TextbookFilter' 17 | 18 | def filter_func(self, dataset): 19 | _, scores = self.scorer(dataset) 20 | metric_scores = np.array(scores['Default']) 21 | metric_filter = (self.min_score <= metric_scores) & (metric_scores <= self.max_score) 22 | return metric_filter.astype(int) 23 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/treeinstrct_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import TreeinstructScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class TreeinstructFilter(TextFilter): 8 | def __init__(self, args_dict: dict): 9 | super().__init__(args_dict) 10 | self.min_score = args_dict['min_score'] 11 | self.max_score = args_dict['max_score'] 12 | scorer_args = args_dict.get('scorer_args') 13 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 14 | self.scorer = TreeinstructScorer(scorer_args) 15 | self.filter_name = 'TreeinstructFilter' 16 | 17 | def filter_func(self, dataset): 18 | _, scores = self.scorer(dataset) 19 | return np.array([self.min_score <= score <= self.max_score for score in 
scores['Default']]).astype(int) 20 | -------------------------------------------------------------------------------- /dataflow/process/text/filters/unieval_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.Eval.Text import UnievalScorer 2 | from dataflow.core import TextFilter 3 | import numpy as np 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class UnievalFilter(TextFilter): 8 | 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | 12 | self.min_scores = args_dict['min_scores'] 13 | self.max_scores = args_dict['max_scores'] 14 | scorer_args = args_dict.get('scorer_args', {}) 15 | scorer_args['model_cache_dir'] = args_dict.get('model_cache_dir') 16 | self.metrics_to_keep = scorer_args['metrics_to_keep'] 17 | 18 | if not set(self.min_scores.keys()).issubset(set(self.metrics_to_keep.keys())): 19 | raise ValueError("The filtering metrics must be a subset of metrics_to_keep.") 20 | 21 | if not set(self.max_scores.keys()).issubset(set(self.metrics_to_keep.keys())): 22 | raise ValueError("The filtering metrics must be a subset of metrics_to_keep.") 23 | 24 | self.metric_name_map = { 25 | 'fluency': 'UniEvalFluencyScore', 26 | 'naturalness': 'UniEvalNaturalnessScore', 27 | 'understandability': 'UniEvalUnderstandabilityScore' 28 | } 29 | 30 | self.scorer = UnievalScorer(scorer_args) 31 | self.filter_name = 'UnievalFilter' 32 | 33 | def filter_func(self, dataset): 34 | _, scores = self.scorer(dataset) 35 | 36 | results = np.ones(len(dataset), dtype=int) 37 | 38 | for metric, min_score in self.min_scores.items(): 39 | max_score = self.max_scores[metric] 40 | score_key = self.metric_name_map[metric] 41 | 42 | metric_scores = np.array(scores[score_key]) 43 | metric_filter = (min_score <= metric_scores) & (metric_scores <= max_score) 44 | 45 | results = results & metric_filter.astype(int) 46 | 47 | return results 48 | 
-------------------------------------------------------------------------------- /dataflow/process/text/reasoners/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from dataflow.utils.registry import LazyLoader 3 | 4 | _import_structure = { 5 | "MathProblemFilter": ("dataflow/process/text/reasoners/math_problem_filter.py", "MathProblemFilter"), 6 | "AnswerGroundTruthFilter": ("dataflow/process/text/reasoners/answer_ground_truth_filter.py", "AnswerGroundTruthFilter"), 7 | "AnswerFormatterFilter": ("dataflow/process/text/reasoners/answer_formatter_filter.py", "AnswerFormatterFilter"), 8 | "AnswerNgramFilter": ("dataflow/process/text/reasoners/answer_ngram_filter.py", "AnswerNgramFilter"), 9 | "AnswerTokenLengthFilter": ("dataflow/process/text/reasoners/answer_token_length_filter.py", "AnswerTokenLengthFilter"), 10 | } 11 | 12 | sys.modules[__name__] = LazyLoader(__name__, "dataflow/process/text/reasoners", _import_structure) -------------------------------------------------------------------------------- /dataflow/process/text/reasoners/answer_formatter_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.core import TextFilter, ReasonerFilter 2 | import numpy as np 3 | from dataflow.utils.registry import PROCESSOR_REGISTRY 4 | import re 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class AnswerFormatterFilter(ReasonerFilter): 8 | def __init__(self, args_dict: dict): 9 | super().__init__(args_dict) 10 | self.filter_name = 'AnswerFormatterFilter' 11 | 12 | def is_valid_answer(answer: str) -> bool: 13 | # check final answer in \boxed{} or not 14 | if not re.search(r'\\boxed{.*}', answer): 15 | return False 16 | 17 | return True 18 | 19 | def filter_func(self, dataset): 20 | indexes = np.zeros(len(dataset)).astype(int) 21 | 22 | for i, item in enumerate(dataset): 23 | answer = item['answer'] 24 | if AnswerFormatterFilter.is_valid_answer(answer): 25 | 
indexes[i] = 1 26 | 27 | return indexes -------------------------------------------------------------------------------- /dataflow/process/text/reasoners/answer_ngram_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.core import ReasonerFilter 2 | import numpy as np 3 | import re 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | from dataflow.Eval.Text import NgramScorer 6 | 7 | @PROCESSOR_REGISTRY.register() 8 | class AnswerNgramFilter(ReasonerFilter): 9 | def __init__(self, args_dict: dict): 10 | super().__init__(args_dict) 11 | self.filter_name = 'AnswerNgramFilter' 12 | self.min_score = args_dict['min_score'] 13 | self.max_score = args_dict['max_score'] 14 | self.ngrams = args_dict['ngrams'] 15 | 16 | def filter_func(self, dataset): 17 | scores = [] 18 | for sample in dataset: 19 | answer = sample['answer'] 20 | content = answer.lower() 21 | content = re.sub(r'[^\w\s]', '', content) 22 | words = content.split() 23 | ngrams = [' '.join(words[i:i + self.ngrams]) for i in range(len(words) - (self.ngrams - 1))] 24 | unique_ngrams = set(ngrams) 25 | 26 | total_ngrams = len(ngrams) 27 | unique_ngrams_count = len(unique_ngrams) 28 | 29 | repetition_score = unique_ngrams_count / total_ngrams if total_ngrams > 0 else 0.0 30 | scores.append(repetition_score) 31 | 32 | return np.array([self.min_score <= score <= self.max_score for score in scores]).astype(int) -------------------------------------------------------------------------------- /dataflow/process/text/reasoners/answer_token_length_filter.py: -------------------------------------------------------------------------------- 1 | from dataflow.core import ReasonerFilter 2 | import numpy as np 3 | from dataflow.utils.registry import PROCESSOR_REGISTRY 4 | from transformers import AutoTokenizer 5 | 6 | @PROCESSOR_REGISTRY.register() 7 | class AnswerTokenLengthFilter(ReasonerFilter): 8 | def __init__(self, args_dict: dict): 9 | 
super().__init__(args_dict) 10 | self.filter_name = 'AnswerTokenLengthFilter' 11 | self.max_answer_token_length = args_dict['max_answer_token_length'] 12 | self.tokenizer = AutoTokenizer.from_pretrained(args_dict['tokenizer_dir']) 13 | 14 | def filter_func(self, dataset): 15 | def get_token_count(input_string): 16 | tokens = self.tokenizer.encode(input_string, add_special_tokens=False) 17 | return len(tokens) 18 | 19 | return np.array([get_token_count(item['answer']) <= self.max_answer_token_length for item in dataset]).astype(int) -------------------------------------------------------------------------------- /dataflow/process/text/refiners/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from dataflow.utils.registry import LazyLoader 3 | 4 | _import_structure = { 5 | "LowercaseRefiner": ("dataflow/process/text/refiners/lowercase_refiner.py", "LowercaseRefiner"), 6 | "PIIAnonymizeRefiner": ("dataflow/process/text/refiners/pii_anonymize_refiner.py", "PIIAnonymizeRefiner"), 7 | "RemovePunctuationRefiner": ("dataflow/process/text/refiners/remove_punctuation_refiner.py", "RemovePunctuationRefiner"), 8 | "RemoveNumberRefiner": ("dataflow/process/text/refiners/remove_number_refiner.py", "RemoveNumberRefiner"), 9 | "RemoveExtraSpacesRefiner": ("dataflow/process/text/refiners/remove_extra_spaces_refiner.py", "RemoveExtraSpacesRefiner"), 10 | "RemoveRepetitionsPunctuationRefiner": ("dataflow/process/text/refiners/remove_repetitions_punctuation_refiner.py", "RemoveRepetitionsPunctuationRefiner"), 11 | "RemoveEmojiRefiner": ("dataflow/process/text/refiners/remove_emoji_refiner.py", "RemoveEmojiRefiner"), 12 | "RemoveEmoticonsRefiner": ("dataflow/process/text/refiners/remove_emoticons_refiner.py", "RemoveEmoticonsRefiner"), 13 | "RemoveContractionsRefiner": ("dataflow/process/text/refiners/remove_contractions_refiner.py", "RemoveContractionsRefiner"), 14 | "HtmlUrlRemoverRefiner": 
("dataflow/process/text/refiners/html_url_remover_refiner.py", "HtmlUrlRemoverRefiner"), 15 | "TextNormalizationRefiner": ("dataflow/process/text/refiners/text_normalization_refiner.py", "TextNormalizationRefiner"), 16 | "NERRefiner": ("dataflow/process/text/refiners/ner_refiner.py", "NERRefiner"), 17 | "StemmingLemmatizationRefiner": ("dataflow/process/text/refiners/stemming_lemmatization_refiner.py", "StemmingLemmatizationRefiner"), 18 | "SpellingCorrectionRefiner": ("dataflow/process/text/refiners/spelling_correction_refiner.py", "SpellingCorrectionRefiner"), 19 | "RemoveStopwordsRefiner": ("dataflow/process/text/refiners/remove_stopwords_refiner.py", "RemoveStopwordsRefiner") 20 | } 21 | 22 | sys.modules[__name__] = LazyLoader(__name__, "dataflow/process/text/refiners", _import_structure) 23 | -------------------------------------------------------------------------------- /dataflow/process/text/refiners/html_url_remover_refiner.py: -------------------------------------------------------------------------------- 1 | from dataflow.core import TextRefiner 2 | from dataflow.data import TextDataset 3 | import re 4 | from dataflow.utils.registry import PROCESSOR_REGISTRY 5 | from tqdm import tqdm 6 | 7 | """ 8 | This refiner class, HtmlUrlRemoverRefiner, is designed to clean text data by removing URLs and HTML tags. 9 | It iterates over specified fields in a dataset, detects and removes any web URLs (e.g., starting with "http" or "https") 10 | and HTML elements (e.g., ""). After cleaning, it returns the refined dataset and counts how many items were modified. 
@PROCESSOR_REGISTRY.register()
class HtmlUrlRemoverRefiner(TextRefiner):
    """Refiner that strips web URLs and HTML tags from the configured text fields."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.refiner_name = 'HtmlUrlRemoverRefiner'

    def refine_func(self, dataset):
        # `dataset.keys` may be a single field name or a list of them.
        fields = dataset.keys if isinstance(dataset.keys, list) else [dataset.keys]
        kept_items = []
        changed_count = 0
        for entry in tqdm(dataset, desc=f"Implementing {self.refiner_name}"):
            if not isinstance(entry, dict):
                # Non-dict entries are not kept.
                continue
            entry_changed = False
            for field in fields:
                value = entry.get(field)
                if not isinstance(value, str):
                    continue
                # Remove URLs (plus trailing newlines), then HTML tags.
                cleaned = re.sub(r'https?:\/\/\S+[\r\n]*', '', value, flags=re.MULTILINE)
                cleaned = re.sub(r'<.*?>', '', cleaned)
                if cleaned != value:
                    entry[field] = cleaned
                    entry_changed = True
            kept_items.append(entry)
            if entry_changed:
                changed_count += 1
        dataset.dataset = kept_items
        return dataset, changed_count
@PROCESSOR_REGISTRY.register()
class LowercaseRefiner(TextRefiner):
    """Refiner that lowercases the text stored under the dataset's key fields."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.refiner_name = 'LowercaseRefiner'

    def refine_func(self, dataset):
        # `dataset.keys` may be a single field name or a list of them.
        fields = dataset.keys if isinstance(dataset.keys, list) else [dataset.keys]
        kept_items = []
        changed_count = 0
        for entry in tqdm(dataset, desc=f"Implementing {self.refiner_name}"):
            if not isinstance(entry, dict):
                # Non-dict entries are not kept.
                continue
            entry_changed = False
            for field in fields:
                value = entry.get(field)
                if not isinstance(value, str):
                    continue
                lowered = value.lower()
                if lowered != value:
                    entry[field] = lowered
                    entry_changed = True
            kept_items.append(entry)
            if entry_changed:
                changed_count += 1
        dataset.dataset = kept_items
        return dataset, changed_count
@PROCESSOR_REGISTRY.register()
class RemoveContractionsRefiner(TextRefiner):
    """Refiner that expands English contractions (e.g. "can't" -> "cannot") in the key fields."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.refiner_name = 'RemoveContractionsRefiner'

    def refine_func(self, dataset):
        # `dataset.keys` may be a single field name or a list of them.
        fields = dataset.keys if isinstance(dataset.keys, list) else [dataset.keys]
        kept_items = []
        changed_count = 0
        for entry in tqdm(dataset, desc=f"Implementing {self.refiner_name}"):
            if not isinstance(entry, dict):
                # Non-dict entries are not kept.
                continue
            entry_changed = False
            for field in fields:
                value = entry.get(field)
                if not isinstance(value, str):
                    continue
                # Expansion is delegated to the `contractions` library.
                expanded = contractions.fix(value)
                if expanded != value:
                    entry[field] = expanded
                    entry_changed = True
            kept_items.append(entry)
            if entry_changed:
                changed_count += 1
        dataset.dataset = kept_items
        return dataset, changed_count
@PROCESSOR_REGISTRY.register()
class RemoveExtraSpacesRefiner(TextRefiner):
    """Refiner that collapses runs of whitespace to single spaces and trims the ends."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.refiner_name = 'RemoveExtraSpacesRefiner'

    def refine_func(self, dataset):
        # `dataset.keys` may be a single field name or a list of them.
        fields = dataset.keys if isinstance(dataset.keys, list) else [dataset.keys]
        kept_items = []
        changed_count = 0
        for entry in tqdm(dataset, desc=f"Implementing {self.refiner_name}"):
            if not isinstance(entry, dict):
                # Non-dict entries are not kept.
                continue
            entry_changed = False
            for field in fields:
                value = entry.get(field)
                if not isinstance(value, str):
                    continue
                # split()/join collapses any whitespace runs and strips the ends.
                normalized = " ".join(value.split())
                if normalized != value:
                    entry[field] = normalized
                    entry_changed = True
            kept_items.append(entry)
            if entry_changed:
                changed_count += 1
        dataset.dataset = kept_items
        return dataset, changed_count
@PROCESSOR_REGISTRY.register()
class RemoveNumberRefiner(TextRefiner):
    """Refiner that deletes all decimal digit characters from the key fields."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.refiner_name = 'RemoveNumberRefiner'

    def refine_func(self, dataset):
        # `dataset.keys` may be a single field name or a list of them.
        fields = dataset.keys if isinstance(dataset.keys, list) else [dataset.keys]
        kept_items = []
        changed_count = 0
        for entry in tqdm(dataset, desc=f"Implementing {self.refiner_name}"):
            if not isinstance(entry, dict):
                # Non-dict entries are not kept.
                continue
            entry_changed = False
            for field in fields:
                value = entry.get(field)
                if not isinstance(value, str):
                    continue
                # Keep every character that is not a digit.
                digitless = ''.join(ch for ch in value if not ch.isdigit())
                if digitless != value:
                    entry[field] = digitless
                    entry_changed = True
            kept_items.append(entry)
            if entry_changed:
                changed_count += 1
        dataset.dataset = kept_items
        return dataset, changed_count
@PROCESSOR_REGISTRY.register()
class RemovePunctuationRefiner(TextRefiner):
    """Refiner that strips ASCII punctuation (string.punctuation) from the key fields."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.refiner_name = 'RemovePunctuationRefiner'
        # Characters removed by refine_func.
        self.punct_to_remove = string.punctuation

    def refine_func(self, dataset):
        # Build the deletion table once instead of per item.
        table = str.maketrans('', '', self.punct_to_remove)
        # `dataset.keys` may be a single field name or a list of them.
        fields = dataset.keys if isinstance(dataset.keys, list) else [dataset.keys]
        kept_items = []
        changed_count = 0
        for entry in tqdm(dataset, desc=f"Implementing {self.refiner_name}"):
            if not isinstance(entry, dict):
                # Non-dict entries are not kept.
                continue
            entry_changed = False
            for field in fields:
                value = entry.get(field)
                if not isinstance(value, str):
                    continue
                stripped = value.translate(table)
                if stripped != value:
                    entry[field] = stripped
                    entry_changed = True
            kept_items.append(entry)
            if entry_changed:
                changed_count += 1
        dataset.dataset = kept_items
        return dataset, changed_count
@PROCESSOR_REGISTRY.register()
class RemoveRepetitionsPunctuationRefiner(TextRefiner):
    """Refiner that collapses repeated punctuation (e.g. "!!" -> "!", "__" -> "_")."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.refiner_name = 'RemoveRepetitionsPunctuationRefiner'

    def refine_func(self, dataset):
        # `dataset.keys` may be a single field name or a list of them.
        fields = dataset.keys if isinstance(dataset.keys, list) else [dataset.keys]
        kept_items = []
        changed_count = 0
        for entry in tqdm(dataset, desc=f"Implementing {self.refiner_name}"):
            if not isinstance(entry, dict):
                # Non-dict entries are not kept.
                continue
            entry_changed = False
            for field in fields:
                value = entry.get(field)
                if not isinstance(value, str):
                    continue
                # Group 1 matches a repeated non-word, non-space char (except '_');
                # group 2 matches repeated underscores. Exactly one group matches
                # per hit, so the '\1\2' replacement keeps a single instance.
                deduped = re.sub(r'([^\w\s_])\1+|(_)\2+', r'\1\2', value)
                if deduped != value:
                    entry[field] = deduped
                    entry_changed = True
            kept_items.append(entry)
            if entry_changed:
                changed_count += 1
        dataset.dataset = kept_items
        return dataset, changed_count
@PROCESSOR_REGISTRY.register()
class DOVERFilter(VideoFilter):
    """Keeps videos whose DOVER 'technical' and 'aesthetic' scores both fall inside
    the configured [min, max] bounds; returns a 0/1 mask per video."""

    def __init__(self, args_dict):
        super().__init__(args_dict)
        self.min_tech_score = args_dict['min_tech_score']
        self.max_tech_score = args_dict['max_tech_score']
        self.min_aes_score = args_dict['min_aes_score']
        self.max_aes_score = args_dict['max_aes_score']
        self.scorer = DOVERScorer(args_dict['scorer_args'])

    def filter_func(self, dataset):
        _, scores = self.scorer(dataset)
        keep = []
        for tech, aes in zip(scores['technical'], scores['aesthetic']):
            tech_ok = self.min_tech_score <= tech <= self.max_tech_score
            aes_ok = self.min_aes_score <= aes <= self.max_aes_score
            keep.append(tech_ok and aes_ok)
        return np.array(keep).astype(int)
@PROCESSOR_REGISTRY.register()
class FasterVQAFilter(VideoFilter):
    """Keeps videos whose FasterVQA score (scores['Default']) lies within
    [min_score, max_score]; returns a 0/1 mask per video."""

    def __init__(self, args_dict):
        super().__init__(args_dict)
        self.min_score = args_dict['min_score']
        self.max_score = args_dict['max_score']
        self.scorer = FasterVQAScorer(args_dict['scorer_args'])

    def filter_func(self, dataset):
        _, scores = self.scorer(dataset)
        keep = []
        for value in scores['Default']:
            keep.append(self.min_score <= value <= self.max_score)
        return np.array(keep).astype(int)
@PROCESSOR_REGISTRY.register()
class PACScoreFilter(VideoTextFilter):
    """Keeps video-text pairs whose PAC score (scores['PACScore(X,V)']['full_F'])
    lies within [min_score, max_score]; returns a 0/1 mask per pair."""

    def __init__(self, args_dict):
        super().__init__(args_dict)
        self.min_score = args_dict['min_score']
        self.max_score = args_dict['max_score']
        self.scorer = PACScorer(args_dict['scorer_args'])

    def filter_func(self, dataset):
        _, scores = self.scorer(dataset)
        keep = []
        for value in scores['PACScore(X,V)']['full_F']:
            keep.append(self.min_score <= value <= self.max_score)
        return np.array(keep).astype(int)
@PROCESSOR_REGISTRY.register()
class VideoResolutionFilter(VideoFilter):
    """Keeps videos whose width and height both fall inside the configured
    [min, max] bounds; returns a 0/1 mask per video."""

    def __init__(self, args_dict: dict):

        super().__init__(args_dict)
        self.min_width = args_dict['min_width']
        self.max_width = args_dict['max_width']
        self.min_height = args_dict['min_height']
        self.max_height = args_dict['max_height']
        self.scorer = VideoResolutionScorer(args_dict['scorer_args'])

    def filter_func(self, dataset):
        _, scores = self.scorer(dataset)
        # Fix: removed a leftover debug `print(scores)` that spammed stdout on
        # every run (sibling filters only keep such prints commented out).
        return np.array([
            self.min_width <= width <= self.max_width and
            self.min_height <= height <= self.max_height
            for width, height in zip(scores['width'], scores['height'])
        ]).astype(int)
def api_chat(
    system_info: str,
    messages: str,
    model: str,
    api_url : str = "",
    api_key : str = "",
    finish_try: int = 3,
    mode_test : bool = True
):
    """Send a single system+user chat-completion request and return the reply text.

    Parameters:
        system_info: content of the "system" message.
        messages: content of the "user" message.
        model: model identifier to request.
        api_url: endpoint URL (raw-HTTP mode) or base URL (OpenAI-client mode).
        api_key: API key; falls back to the API_KEY environment variable.
        finish_try: number of attempts before giving up in raw-HTTP mode.
        mode_test: when True, POST via `requests`; otherwise use the OpenAI client.

    Returns:
        The assistant's message content, or None if every raw-HTTP attempt fails.

    Raises:
        ValueError: if no API key is supplied and API_KEY is not set.
    """
    if api_key == "":
        api_key = os.environ.get("API_KEY")

    if api_key is None:
        raise ValueError("Lack of API_KEY")

    if mode_test is True:
        payload = json.dumps({
            "model": model,
            "messages": [
                {"role": "system", "content": system_info},
                {"role": "user", "content": messages}
            ]
        })

        headers = {
            'Authorization': f"Bearer {api_key}",
            'Content-Type': 'application/json',
            'User-Agent': 'Apifox/1.0.0 (https://apifox.com)'
        }

        # Fix: `finish_try` was accepted but never used, so a transient failure
        # was never retried; a non-200 response also fell through to an
        # implicit, silent None. Retry up to `finish_try` times and report.
        for attempt in range(finish_try):
            try:
                # request OpenAI-compatible API
                response = requests.post(api_url, headers=headers, data=payload, timeout=1800)

                if response.status_code == 200:
                    response_data = response.json()
                    return response_data['choices'][0]['message']['content']

                print(f"Request attempt {attempt + 1}/{finish_try} failed with status {response.status_code}")

            except Exception as e:
                print("Error:", e)

        return None

    else :
        # Fix: `api_url` was previously ignored in this branch, always hitting
        # the default OpenAI endpoint; honour it when provided.
        client = OpenAI(api_key=api_key, base_url=api_url or None)
        api_response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_info},
                {"role": "user", "content": messages}
            ]
        )

        response_content = api_response.choices[0].message.content.strip()

        return response_content
def check_serializable_fields(data):
    """Return the keys of `data` whose values can be serialized with json.dumps.

    Keys whose values raise TypeError or ValueError during serialization
    (e.g. sets, arbitrary objects) are omitted; insertion order is preserved.
    """
    def _is_serializable(value):
        # Probe serialization; json.dumps raises TypeError/ValueError on failure.
        try:
            json.dumps(value)
            return True
        except (TypeError, ValueError):
            return False

    return [key for key in data if _is_serializable(data[key])]
What is its area?"} 2 | {"problem": "If a car travels at 60 km/h for 2 hours, how far does it go?"} 3 | -------------------------------------------------------------------------------- /demos/image_eval/gen_image.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "image": "1e558d58-f53e-422e-9715-19d4c12c093d.png" 4 | }, 5 | { 6 | "image": "202a9b83-28e5-4002-9858-64a4cb734f91.png" 7 | }, 8 | { 9 | "image": "65250ac7-c2ed-4ed1-b375-0916649c849b.png" 10 | }, 11 | { 12 | "image": "d10f73b1-ecff-4562-a3b9-3d2d0fc0b82a.png" 13 | }, 14 | { 15 | "image": "bda58b80-f069-45f5-9eb3-5d023fb731ec.png" 16 | }, 17 | { 18 | "image": "a34a8f2f-e3f5-42b7-9f18-cad076ad70bb.png" 19 | }, 20 | { 21 | "image": "67d2b934-2be9-470b-898e-98251a81e74d.png" 22 | }, 23 | { 24 | "image": "1de8c101-7d18-45c9-90ad-6b27bc9b565f.png" 25 | }, 26 | { 27 | "image": "cd2eed6b-8aea-40f1-a635-3cb9b4f1f460.png" 28 | }, 29 | { 30 | "image": "e5a47fa8-f901-40cb-9f3d-39b048152fe2.png" 31 | } 32 | ] -------------------------------------------------------------------------------- /demos/image_eval/gen_images/1de8c101-7d18-45c9-90ad-6b27bc9b565f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/1de8c101-7d18-45c9-90ad-6b27bc9b565f.png -------------------------------------------------------------------------------- /demos/image_eval/gen_images/1e558d58-f53e-422e-9715-19d4c12c093d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/1e558d58-f53e-422e-9715-19d4c12c093d.png -------------------------------------------------------------------------------- /demos/image_eval/gen_images/202a9b83-28e5-4002-9858-64a4cb734f91.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/202a9b83-28e5-4002-9858-64a4cb734f91.png -------------------------------------------------------------------------------- /demos/image_eval/gen_images/65250ac7-c2ed-4ed1-b375-0916649c849b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/65250ac7-c2ed-4ed1-b375-0916649c849b.png -------------------------------------------------------------------------------- /demos/image_eval/gen_images/67d2b934-2be9-470b-898e-98251a81e74d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/67d2b934-2be9-470b-898e-98251a81e74d.png -------------------------------------------------------------------------------- /demos/image_eval/gen_images/a34a8f2f-e3f5-42b7-9f18-cad076ad70bb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/a34a8f2f-e3f5-42b7-9f18-cad076ad70bb.png -------------------------------------------------------------------------------- /demos/image_eval/gen_images/bda58b80-f069-45f5-9eb3-5d023fb731ec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/bda58b80-f069-45f5-9eb3-5d023fb731ec.png -------------------------------------------------------------------------------- /demos/image_eval/gen_images/cd2eed6b-8aea-40f1-a635-3cb9b4f1f460.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/cd2eed6b-8aea-40f1-a635-3cb9b4f1f460.png -------------------------------------------------------------------------------- /demos/image_eval/gen_images/d10f73b1-ecff-4562-a3b9-3d2d0fc0b82a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/d10f73b1-ecff-4562-a3b9-3d2d0fc0b82a.png -------------------------------------------------------------------------------- /demos/image_eval/gen_images/e5a47fa8-f901-40cb-9f3d-39b048152fe2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/gen_images/e5a47fa8-f901-40cb-9f3d-39b048152fe2.png -------------------------------------------------------------------------------- /demos/image_eval/image.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "image": "10007903636.jpg" 4 | }, 5 | { 6 | "image": "10089027076.jpg" 7 | } 8 | ] -------------------------------------------------------------------------------- /demos/image_eval/image_text.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "image": "cake.jpg", 4 | "caption": "a slice of chocolate cake on a white plate with a fork next to it" 5 | }, 6 | { 7 | "image": "cake.jpg", 8 | "caption": "a strawberry cake placed on the ground" 9 | }, 10 | { 11 | "image": "giraffe.jpg", 12 | "caption": "a deer eating grass" 13 | }, 14 | { 15 | "image": "giraffe.jpg", 16 | "caption": "a giraffe reaching up to eat from a tree" 17 | } 18 | ] 
-------------------------------------------------------------------------------- /demos/image_eval/images/10007903636.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/images/10007903636.jpg -------------------------------------------------------------------------------- /demos/image_eval/images/10089027076.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/images/10089027076.jpg -------------------------------------------------------------------------------- /demos/image_eval/images/cake.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/images/cake.jpg -------------------------------------------------------------------------------- /demos/image_eval/images/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_eval/images/giraffe.jpg -------------------------------------------------------------------------------- /demos/image_eval/ref_image.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "image": "cake.jpg" 4 | }, 5 | { 6 | "image": "giraffe.jpg" 7 | } 8 | ] -------------------------------------------------------------------------------- /demos/image_eval/run_images.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, '../DataFlow') 3 | 4 | from dataflow.utils.utils import calculate_score 5 | 6 | calculate_score() 
-------------------------------------------------------------------------------- /demos/image_process/image.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "image": "origin.png" 4 | }, 5 | { 6 | "image": "hflip.png" 7 | }, 8 | { 9 | "image": "other.png" 10 | }, 11 | { 12 | "image": "resize.png" 13 | }, 14 | { 15 | "image": "rotate.png" 16 | } 17 | ] -------------------------------------------------------------------------------- /demos/image_process/images/hflip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_process/images/hflip.png -------------------------------------------------------------------------------- /demos/image_process/images/origin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_process/images/origin.png -------------------------------------------------------------------------------- /demos/image_process/images/other.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_process/images/other.png -------------------------------------------------------------------------------- /demos/image_process/images/resize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_process/images/resize.png -------------------------------------------------------------------------------- /demos/image_process/images/rotate.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/image_process/images/rotate.png -------------------------------------------------------------------------------- /demos/image_process/test.py: -------------------------------------------------------------------------------- 1 | # run "python demos/image_process/test.py --config configs/process/image_deduplicate.yaml" under main folder 2 | 3 | import sys 4 | sys.path.insert(0, '../DataFlow') 5 | 6 | from dataflow.utils.utils import process 7 | 8 | if __name__ == '__main__': 9 | process() 10 | -------------------------------------------------------------------------------- /demos/text_process/reasoners/text_process_mathproblem.jsonl: -------------------------------------------------------------------------------- 1 | {"problem": "A rectangle has a length of 10 cm and a width of 5 cm. What is its area?"} 2 | {"problem": "If a car travels at 60 km/h for 2 hours, how far does it go?"} 3 | {"problem": "Solveaaaaaa for x: 2x + 5 = 15"} -------------------------------------------------------------------------------- /demos/video_eval/test_video.avi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/video_eval/test_video.avi -------------------------------------------------------------------------------- /demos/video_eval/test_video.mkv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/video_eval/test_video.mkv -------------------------------------------------------------------------------- /demos/video_eval/test_video.mov: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/video_eval/test_video.mov -------------------------------------------------------------------------------- /demos/video_eval/test_video.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/video_eval/test_video.mp4 -------------------------------------------------------------------------------- /demos/video_eval/video-caption.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "video": "test_video.avi", 4 | "enCap": [ 5 | "A man is clipping paper.", 6 | "A man is cutting paper." 7 | ] 8 | } 9 | ] -------------------------------------------------------------------------------- /demos/video_eval/video.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "video": "test_video.mp4" 4 | }, 5 | { 6 | "video": "test_video.mov" 7 | } 8 | ] -------------------------------------------------------------------------------- /demos/video_eval/video_caption_eval.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | 4 | data: 5 | video-caption: 6 | meta_data_path: './demos/video_eval/video-caption.json' # Path to meta data (mainly for image or video data) 7 | data_path: './demos/video_eval' # Path to dataset 8 | formatter: 'VideoCaptionFormatter' # formatter for pure video evaluation 9 | 10 | scorers: 11 | EMScorer: 12 | batch_size: 4 13 | num_workers: 4 14 | 15 | 16 | -------------------------------------------------------------------------------- /demos/video_eval/video_eval.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 
| 4 | data: 5 | video: 6 | meta_data_path: './video.json' # Path to meta data (mainly for image or video data) 7 | data_path: './' # Path to dataset 8 | formatter: 'PureVideoFormatter' # formatter for pure video evaluation 9 | 10 | scorers: 11 | VideoMotionScorer: # Keep samples with video motion scores within a specific range. 12 | batch_size: 1 13 | num_workers: 4 14 | min_score: 0.25 # the minimum motion score to keep samples 15 | max_score: 10000.0 # the maximum motion score to keep samples 16 | sampling_fps: 2 # the sampling rate of frames_per_second to compute optical flow 17 | size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w) 18 | max_size: null # maximum allowed for the longer edge of resized frames 19 | relative: false # whether to normalize the optical flow magnitude to [0, 1], relative to the frame's diagonal length 20 | any_or_all: any # keep this sample when any/all videos meet the filter condition 21 | -------------------------------------------------------------------------------- /demos/video_process/video5data.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "flickr_id": 8536919744 4 | }, 5 | { 6 | "flickr_id": 6408325533 7 | }, 8 | { 9 | "flickr_id": 5319047612 10 | }, 11 | { 12 | "flickr_id": 8724380666 13 | }, 14 | { 15 | "flickr_id": 4744073127 16 | } 17 | ] -------------------------------------------------------------------------------- /demos/video_process/video_caption_process.yaml: -------------------------------------------------------------------------------- 1 | model_cache_path: '../ckpt' # Path to cache models 2 | num_workers: 2 3 | dependencies: [video] 4 | save_path: './example.jsonl' 5 | data: 6 | video_caption: 7 | meta_data_path: 'demos/video_process/videocap5data.json' # Path to meta data (mainly for image or video data) 8 | data_path: 'demos/video_process/video-caption/' 9 | formatter: 'VideoCaptionFormatter' # formatter 
for video-caption evaluation 10 | 11 | processors: 12 | EMScoreFilter: 13 | min_score: 0.3 14 | max_score: 1.0 15 | scorer_args: 16 | batch_size: 16 17 | num_workers: 4 18 | PACScoreFilter: 19 | min_score: 0.3 20 | max_score: 1.0 21 | scorer_args: 22 | batch_size: 16 23 | num_workers: 4 24 | model_path: ./models/clip_ViT-B-32.pth 25 | -------------------------------------------------------------------------------- /demos/video_process/videos/4744073127.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/video_process/videos/4744073127.mp4 -------------------------------------------------------------------------------- /demos/video_process/videos/5319047612.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/video_process/videos/5319047612.mp4 -------------------------------------------------------------------------------- /demos/video_process/videos/6408325533.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/video_process/videos/6408325533.mp4 -------------------------------------------------------------------------------- /demos/video_process/videos/8536919744.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/video_process/videos/8536919744.mp4 -------------------------------------------------------------------------------- /demos/video_process/videos/8724380666.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/demos/video_process/videos/8724380666.mp4 -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_build/doctrees/README.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/README.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/_build/doctrees/index.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/getting_started/Bring_Your_Own_Scorer.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/getting_started/Bring_Your_Own_Scorer.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/getting_started/Installation.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/getting_started/Installation.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/getting_started/customized_scorer.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/getting_started/customized_scorer.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/getting_started/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/getting_started/index.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/gen_text_metrics.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/gen_text_metrics.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/gen_text_metrics.zh-CN.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/gen_text_metrics.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/image_metrics.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/image_metrics.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/image_metrics.zh-CN.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/image_metrics.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/image_process.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/image_process.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/image_process.zh-CN.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/image_process.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/index.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/synth_metrics.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/synth_metrics.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/text_metrics.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/text_metrics.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/text_metrics.zh-CN.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/text_metrics.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/text_process.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/text_process.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/text_process.zh-CN.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/text_process.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/video_metrics.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/video_metrics.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/video_metrics.zh-CN.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/video_metrics.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/video_process.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/video_process.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/src/metrics/video_process.zh-CN.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/doctrees/src/metrics/video_process.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: fb225de1c1c504e1d9fc70e0be2640f3 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/README.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/README.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/index.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/getting_started/Bring_Your_Own_Scorer.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/getting_started/Bring_Your_Own_Scorer.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/getting_started/Installation.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/getting_started/Installation.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/getting_started/customized_scorer.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/getting_started/customized_scorer.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/getting_started/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/getting_started/index.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/metrics/image_metrics.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/metrics/image_metrics.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/metrics/image_metrics.zh-CN.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/metrics/image_metrics.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/metrics/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/metrics/index.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/metrics/synth_metrics.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/metrics/synth_metrics.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/metrics/text_metrics.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/metrics/text_metrics.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/metrics/text_metrics.zh-CN.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/metrics/text_metrics.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/metrics/video_metrics.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/metrics/video_metrics.doctree -------------------------------------------------------------------------------- /docs/_build/html/.doctrees/src/metrics/video_metrics.zh-CN.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/.doctrees/src/metrics/video_metrics.zh-CN.doctree -------------------------------------------------------------------------------- /docs/_build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. Open-DataFlow-Eval documentation master file, created by 2 | sphinx-quickstart on Sat Nov 2 18:54:36 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Open-DataFlow-Eval documentation 7 | ================================ 8 | 9 | Welcome to Open-DataFlow-Eval's documentation! 10 | 11 | This documentation is intended to provide a comprehensive guide to the metrics, APIs, and dataflow used in the Open-DataFlow-Eval project. 12 | 13 | .. 
toctree:: 14 | :maxdepth: 2 15 | :caption: Open-DataFlow-Eval: 16 | 17 | src/getting_started/index 18 | src/metrics/index 19 | 20 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/getting_started/Bring_Your_Own_Scorer.md.txt: -------------------------------------------------------------------------------- 1 | # Customized Scorer 2 | 3 | ## TODO 4 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/getting_started/Installation.md.txt: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## TODO 4 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/getting_started/customized_scorer.md.txt: -------------------------------------------------------------------------------- 1 | # Customized Scorer 2 | 3 | ## TODO 4 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/getting_started/index.rst.txt: -------------------------------------------------------------------------------- 1 | Getting Started 2 | ================================ 3 | 4 | Please see docs below for information on how to get started with Open-DataFlow-Eval. 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | :caption: Getting Started: 9 | 10 | Installation.md 11 | customized_scorer.md 12 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/metrics/image_process.md.txt: -------------------------------------------------------------------------------- 1 | # Introduction to Image Data Processors 2 | Note: We have set up corresponding filters for each scorer, named following the rules ```MetricnameScorer``` and ```MetricFilter```. Please refer to the [image_metrics.md](./image_metrics.md) for an introduction to these filters. 
The following introduces only the processors that differ from the previous scorers. 3 | 4 | 5 | | Processor Name | Description | 6 | |---|---| 7 | | ImagePHashDeduplicator | Based on the [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html) method from [imagededup](https://github.com/idealo/imagededup) | 8 | | ImageDHashDeduplicator | Based on the [Difference hashing](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html) method from [imagededup](https://github.com/idealo/imagededup) | 9 | | ImageWHashDeduplicator | Based on the [Wavelet hashing](https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5) method from [imagededup](https://github.com/idealo/imagededup) | 10 | | ImageAHashDeduplicator | Based on the [Average hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html) method from [imagededup](https://github.com/idealo/imagededup) | 11 | | ImageAspectRatioFilter | Filters images based on their aspect ratio | 12 | | ImageResolutionFilter | Filters images based on their resolution | 13 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/metrics/image_process.zh-CN.md.txt: -------------------------------------------------------------------------------- 1 | # 图像数据处理器介绍 2 | 注:我们为每个打分器都设置了对应的过滤器,命名规则为```MetricnameScorer```与```MetriFilter```,这部分过滤器的介绍请参考[image_metrics.zh-CN.md](./image_metrics.zh-CN.md)。以下仅介绍与之前的打分器不同的处理器。 3 | 4 | 5 | |处理器名称|简介| 6 | |---|---| 7 | |ImagePHashDeduplicator|基于[imagededup](https://github.com/idealo/imagededup)的[Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)方法| 8 | |ImageDHashDeduplicator|基于[imagededup](https://github.com/idealo/imagededup)的[Difference hashing](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)方法| 9 | 
|ImageWHashDeduplicator|基于[imagededup](https://github.com/idealo/imagededup)的[Wavelet hashing](https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5)方法| 10 | |ImageAHashDeduplicator|基于[imagededup](https://github.com/idealo/imagededup)的[Average hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)| 11 | |ImageAspectRatioFilter|根据图像长宽比进行过滤| 12 | |ImageResolutionFilter|根据图像分辨率进行过滤| -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/metrics/index.rst.txt: -------------------------------------------------------------------------------- 1 | Metrics 2 | ================================ 3 | 4 | Please see docs below for information on the Metrics we provided. 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :caption: Metrics Documentation: 9 | 10 | image_metrics.md 11 | image_metrics.zh-CN.md 12 | text_metrics.md 13 | text_metrics.zh-CN.md 14 | video_metrics.md 15 | video_metrics.zh-CN.md 16 | gen_text_metrics.md 17 | gen_text_metrics.zh-CN.md 18 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/metrics/synth_metrics.md.txt: -------------------------------------------------------------------------------- 1 | ### Synth 2 | TODO -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/metrics/video_process.md.txt: -------------------------------------------------------------------------------- 1 | # Video Data Processing 2 | The processing of video data primarily relies on dataset filtering methods based on evaluation scores. 
3 | 4 | ## Pure Video Processing 5 | ### Method Categories 6 | | Category Description | Metric List | 7 | |--- |--- | 8 | | Based on Video Statistics | Motion Score | 9 | | Based on Pre-trained Models | FastVQAScorer, FasterVQAScorer, DOVERScorer | 10 | 11 | ### Method Overview 12 | | Name | Filtering Metric | Filtering Dimension | Scorer Introduction | Score Range | 13 | | ---- | ---- | ---- | ---- | ---- | 14 | | VideoMotionFilter | Motion Score | Statistics | Calculates the magnitude of optical flow vectors between frames as the score | | 15 | | [FastVQAFilter](https://arxiv.org/abs/2207.02595v1) | Pre-trained model Scoring | Model | Scorer based on Video Swin Transformer, incorporating the Fragment Sampling module, which improves accuracy and speed | [0,1] | 16 | | [FasterVQAFilter](https://arxiv.org/abs/2210.05357) | Pre-trained model Scoring | Model |An optimized version of FastVQAScorer, with improvements to the Fragment Sampling module, achieving significant speed enhancements | [0,1] | 17 | | [DOVERFilter](https://arxiv.org/abs/2211.04894) | Pre-trained model scoring | Model |Based on FastVQAScorer, it provides scores from both technical and aesthetic perspectives | | 18 | 19 | ## Video-Text Processing 20 | | Category Description | Metric List | 21 | |--- |--- | 22 | | Based on pre-trained vision-language models | EMScore, PAC-S | 23 | 24 | | Name | Filtering Metric | Filtering Dimension | Scorer Introduction | Score Range | 25 | | ---- | ---- | ---- | ---- | ---- | 26 | | [EMScorer](https://arxiv.org/abs/2111.08919) | Video-Text Similarity Scoring | Model | A video-text scorer based on CLIP, supporting both with-reference and no-reference scoring. 
| [0,1] | 27 | | [PACScorer](https://arxiv.org/abs/2303.12112) | ideo-Text Similarity Scoring | Model | A video-text scorer based on CLIP, with tuned CLIP Encoder on top of EMScore | [0,1] | -------------------------------------------------------------------------------- /docs/_build/html/_sources/src/metrics/video_process.zh-CN.md.txt: -------------------------------------------------------------------------------- 1 | # 视频数据处理 2 | 视频数据的处理主要是基于评估得分的数据集过滤方法。 3 | ## 纯视频处理 4 | ### 方法分类 5 | |类别描述 | 指标列表| 6 | |--- |--- | 7 | | 基于视频统计信息 | Motion Score| 8 | | 基于预训练模型 | FastVQAScorer, FasterVQAScorer, DOVERScorer| 9 | 10 | ### 方法介绍 11 | | 名称 | 过滤指标 | 过滤维度| 打分器简介 |打分取值范围| 12 | | ---- | ---- | ---- | ---- | ---- | 13 | | VideoMotionFilter | Motion Score| 统计|计算帧之间的光流向量的幅度作为评分 | | 14 | | [FastVQAFilter](https://arxiv.org/abs/2207.02595v1) | 预训练模型打分 | 模型 | 基于Video Swin Transformer的打分器,加入了Fragment Sampling模块,获得了准确性和速度的提升 | [0,1]| 15 | | [FasterVQAFilter](https://arxiv.org/abs/2210.05357) | 预训练模型打分 | 模型 | 基于Video Swin Transformer的打分器,在FastVQAScorer的基础上对Fragment Sampling模块进行优化,得到了显著的速度提升 | [0,1] | 16 | | [DOVERFilter](https://arxiv.org/abs/2211.04894) | 预训练模型打分 | 模型|基于FastVQAScorer的打分器,同时给出了从技术和美学两个角度的评分 || 17 | 18 | ## 视频-文本处理 19 | 20 | |类别描述 | 指标列表| 21 | |--- |--- | 22 | | 基于预训练图文模型 | EMScore, PAC-S| 23 | 24 | 25 | | 名称 | 过滤指标 | 过滤维度| 打分器简介 |打分取值范围| 26 | | ---- | ---- | ---- | ---- | ---- | 27 | | [EMScorer](https://arxiv.org/abs/2111.08919) | 基于视频-文本相似度的打分| 模型|基于CLIP的视频-文本打分器,同时支持with-reference和no-reference的打分功能|[0,1] | 28 | | [PACScorer](https://arxiv.org/abs/2303.12112) | 基于视频-文本相似度的打分 | 模型 | 基于CLIP的视频-文本打分器,在EMScore的基础上对CLIP Encoder进行了调优| [0,1] | 29 | -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-bold-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/lato-bold-italic.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-bold-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-bold.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/lato-bold.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-normal.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/lato-normal.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/css/fonts/lato-normal.woff2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/css/fonts/lato-normal.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | const DOCUMENTATION_OPTIONS = { 2 | VERSION: '0.1', 3 | LANGUAGE: 'zh-CN', 4 | COLLAPSE_INDEX: false, 5 | BUILDER: 'html', 6 | FILE_SUFFIX: '.html', 7 | LINK_SUFFIX: '.html', 8 | HAS_SOURCE: true, 9 | SOURCELINK_SUFFIX: '.txt', 10 | NAVIGATION_WITH_KEYS: false, 11 | SHOW_SEARCH_SUMMARY: true, 12 | ENABLE_SEARCH_SHORTCUTS: true, 13 | }; -------------------------------------------------------------------------------- /docs/_build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/file.png -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-bold.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-bold.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bold.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-bold.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bolditalic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-bolditalic.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bolditalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-bolditalic.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bolditalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-bolditalic.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-bolditalic.woff2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-bolditalic.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-italic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-italic.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-italic.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-italic.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-italic.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-regular.eot 
-------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-regular.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-regular.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/Lato/lato-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/Lato/lato-regular.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf -------------------------------------------------------------------------------- 
/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf -------------------------------------------------------------------------------- /docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff -------------------------------------------------------------------------------- 
/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2 -------------------------------------------------------------------------------- /docs/_build/html/_static/js/badge_only.js: -------------------------------------------------------------------------------- 1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); -------------------------------------------------------------------------------- /docs/_build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/minus.png -------------------------------------------------------------------------------- /docs/_build/html/_static/plus.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/_static/plus.png -------------------------------------------------------------------------------- /docs/_build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/docs/_build/html/objects.inv -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'Open-DataFlow-Eval' 10 | copyright = '2024, Open-DataFlow' 11 | author = 'Open-DataFlow' 12 | release = '0.1' 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = ['myst_parser'] 18 | 19 | templates_path = ['_templates'] 20 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 21 | 22 | language = 'zh_CN' 23 | 24 | # -- Options for HTML output ------------------------------------------------- 25 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 26 | 27 | html_theme = 'sphinx_rtd_theme' 28 | html_static_path = ['_static'] 29 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
Open-DataFlow-Eval documentation master file, created by 2 | sphinx-quickstart on Sat Nov 2 18:54:36 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Open-DataFlow-Eval documentation 7 | ================================ 8 | 9 | Welcome to Open-DataFlow-Eval's documentation! 10 | 11 | This documentation is intended to provide a comprehensive guide to the metrics, APIs, and dataflow used in the Open-DataFlow-Eval project. 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | :caption: Open-DataFlow-Eval: 16 | 17 | src/getting_started/index 18 | src/metrics/index 19 | src/developer/index 20 | 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/src/developer/index.rst: -------------------------------------------------------------------------------- 1 | Developer Documentation 2 | ================================ 3 | 4 | Please see docs below for information on how to develop OpenDataFlow-Eval. 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | :caption: Developer docs: 9 | 10 | logging.md 11 | testcase.md 12 | -------------------------------------------------------------------------------- /docs/src/developer/logging.md: -------------------------------------------------------------------------------- 1 | ## Logger 2 | 3 | 目前logger的初始化在pipeline_step.py中 4 | ```python 5 | import logging 6 | logging.basicConfig(level=logging.INFO, 7 | format="%(asctime)s | %(filename)-20s- %(module)-20s- %(funcName)-20s- %(lineno)5d - %(name)-10s | %(levelname)8s | Processno %(process)5d - Threadno %(thread)-15d : %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S" 9 | ) 10 | ``` 11 | 使用方法如下所示,其中debug, info, warning, error代表不同的日志等级,默认情况下DEBUG等级的日志不会显示。 12 | ```python 13 | def main(): 14 | 15 | logging.debug("This is DEBUG message") 16 | logging.info("This is INFO message") 17 | logging.warning("This is WARNING message") 18 | logging.error("This is ERROR message") 19 | 20 | return 21 | 22 | main() 23 | ``` 24 | 关于等级的分配原则: 25 | 1. DEBUG:一些没什么用需要屏蔽的输出 / 不想展示的技术细节,如: 26 | ```python 27 | for x in ['Text', 'image', 'video']: 28 | module_path = "dataflow.Eval." 
+ x 29 | try: 30 | module_lib = importlib.import_module(module_path) 31 | clss = getattr(module_lib, name) 32 | self._obj_map[name] = clss 33 | return clss 34 | except AttributeError as e: 35 | logging.debug(f"{str(e)}") 36 | continue 37 | except Exception as e: 38 | raise e 39 | ``` 40 | 2. INFO: 让用户得知目前的运行情况,如: 41 | ```python 42 | def pipeline_step(yaml_path, step_name, step_type): 43 | import logging 44 | import yaml 45 | logging.info(f"Loading yaml {yaml_path} ......") 46 | with open(yaml_path, "r") as f: 47 | config = yaml.safe_load(f) 48 | config = merge_yaml(config) 49 | logging.info(f"Load yaml success, config: {config}") 50 | if step_type == "process": 51 | algorithm = get_processor(step_name, config) 52 | elif step_type == "generator": 53 | algorithm = get_generator(step_name, config) 54 | logging.info("Start running ...") 55 | algorithm.run() 56 | ``` 57 | 3. WARNING:可能出现问题的错误信息(暂时没有例子) 58 | 4. ERROR:运行出现错误,打印错误信息 59 | 60 | 算子内部的logging可以参考`DataFlow/dataflow/generator/algorithms/TreeSitterParser.py` -------------------------------------------------------------------------------- /docs/src/developer/testcase.md: -------------------------------------------------------------------------------- 1 | ## Testcase 2 | 3 | testcase的写法可以参考CodePipeline/test/ast_checker_test.py 4 | ```python 5 | import unittest 6 | import tempfile 7 | import os 8 | import sys 9 | import json 10 | sys.path.append("..") 11 | sys.path.append(".") 12 | sys.path.append("../..") 13 | from dataflow.utils.utils import get_generator 14 | 15 | class TreeSitterParserTest(unittest.TestCase): 16 | 17 | def setUp(self): 18 | self.input_file = "/root/workspace/culfjk4p420c73amv510/herunming/DataFlow/CodePipeline/data/ast_data.jsonl" 19 | self.output_file = tempfile.NamedTemporaryFile(delete=False, mode='w+', suffix='.jsonl') 20 | 21 | def tearDown(self): 22 | os.unlink(self.output_file.name) 23 | 24 | def test_run(self): 25 | config = { 26 | "input_file": self.input_file, 27 | "output_file": 
self.output_file.name, 28 | "input_key": "content", 29 | "output_key": "", 30 | "vllm_used": False 31 | } 32 | generator = get_generator("TreeSitterParser", config) 33 | generator.run() 34 | with open(self.output_file.name, "r") as f: 35 | data = [json.loads(_) for _ in f] 36 | for i in range(5): 37 | self.assertEqual(data[i]['ast_error'], 0, f"Not Equal to 0 in data {i}") 38 | for i in range(5, 10): 39 | self.assertEqual(data[i]['ast_error'], 1, f"Not Equal to 1 in data {i}") 40 | 41 | if __name__ == "__main__": 42 | unittest.main() 43 | ``` 44 | 其中setUp是启动test前的准备工作,tearDown是后处理函数,test_run是运行的主函数,unittest.Testcase类内置如assertEqual等函数 -------------------------------------------------------------------------------- /docs/src/getting_started/Installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## TODO 4 | -------------------------------------------------------------------------------- /docs/src/getting_started/customized_scorer.md: -------------------------------------------------------------------------------- 1 | # Customized Scorer 2 | 3 | ## TODO 4 | -------------------------------------------------------------------------------- /docs/src/getting_started/index.rst: -------------------------------------------------------------------------------- 1 | Getting Started 2 | ================================ 3 | 4 | Please see docs below for information on how to get started with Open-DataFlow-Eval. 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | :caption: Getting Started: 9 | 10 | Installation.md 11 | customized_scorer.md 12 | -------------------------------------------------------------------------------- /docs/src/metrics/image_process.md: -------------------------------------------------------------------------------- 1 | # Introduction to Image Data Processors 2 | Note: We have set up corresponding filters for each scorer, named following the rules ```MetricnameScorer``` and ```MetricFilter```. 
Please refer to the [image_metrics.md](./image_metrics.md) for an introduction to these filters. The following introduces only the processors that differ from the previous scorers. 3 | 4 | 5 | | Processor Name | Description | 6 | |---|---| 7 | | ImagePHashDeduplicator | Based on the [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html) method from [imagededup](https://github.com/idealo/imagededup) | 8 | | ImageDHashDeduplicator | Based on the [Difference hashing](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html) method from [imagededup](https://github.com/idealo/imagededup) | 9 | | ImageWHashDeduplicator | Based on the [Wavelet hashing](https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5) method from [imagededup](https://github.com/idealo/imagededup) | 10 | | ImageAHashDeduplicator | Based on the [Average hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html) method from [imagededup](https://github.com/idealo/imagededup) | 11 | | ImageAspectRatioFilter | Filters images based on their aspect ratio | 12 | | ImageResolutionFilter | Filters images based on their resolution | 13 | -------------------------------------------------------------------------------- /docs/src/metrics/image_process.zh-CN.md: -------------------------------------------------------------------------------- 1 | # 图像数据处理器介绍 2 | 注:我们为每个打分器都设置了对应的过滤器,命名规则为```MetricnameScorer```与```MetricFilter```,这部分过滤器的介绍请参考[image_metrics.zh-CN.md](./image_metrics.zh-CN.md)。以下仅介绍与之前的打分器不同的处理器。 3 | 4 | 5 | |处理器名称|简介| 6 | |---|---| 7 | |ImagePHashDeduplicator|基于[imagededup](https://github.com/idealo/imagededup)的[Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)方法| 8 | |ImageDHashDeduplicator|基于[imagededup](https://github.com/idealo/imagededup)的[Difference hashing](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)方法| 9 
| |ImageWHashDeduplicator|基于[imagededup](https://github.com/idealo/imagededup)的[Wavelet hashing](https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5)方法| 10 | |ImageAHashDeduplicator|基于[imagededup](https://github.com/idealo/imagededup)的[Average hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)| 11 | |ImageAspectRatioFilter|根据图像长宽比进行过滤| 12 | |ImageResolutionFilter|根据图像分辨率进行过滤| -------------------------------------------------------------------------------- /docs/src/metrics/index.rst: -------------------------------------------------------------------------------- 1 | Metrics 2 | ================================ 3 | 4 | Please see docs below for information on the Metrics we provided. 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :caption: Metrics Documentation: 9 | 10 | image_metrics.md 11 | image_metrics.zh-CN.md 12 | text_metrics.md 13 | text_metrics.zh-CN.md 14 | video_metrics.md 15 | video_metrics.zh-CN.md 16 | gen_text_metrics.md 17 | gen_text_metrics.zh-CN.md 18 | -------------------------------------------------------------------------------- /docs/src/metrics/video_process.md: -------------------------------------------------------------------------------- 1 | # Video Data Processing 2 | The processing of video data primarily relies on dataset filtering methods based on evaluation scores. 
3 | 4 | ## Pure Video Processing 5 | ### Method Categories 6 | | Category Description | Metric List | 7 | |--- |--- | 8 | | Based on Video Statistics | Motion Score | 9 | | Based on Pre-trained Models | FastVQAScorer, FasterVQAScorer, DOVERScorer | 10 | 11 | ### Method Overview 12 | | Name | Filtering Metric | Filtering Dimension | Scorer Introduction | Score Range | 13 | | ---- | ---- | ---- | ---- | ---- | 14 | | VideoMotionFilter | Motion Score | Statistics | Calculates the magnitude of optical flow vectors between frames as the score | | 15 | | [FastVQAFilter](https://arxiv.org/abs/2207.02595v1) | Pre-trained model Scoring | Model | Scorer based on Video Swin Transformer, incorporating the Fragment Sampling module, which improves accuracy and speed | [0,1] | 16 | | [FasterVQAFilter](https://arxiv.org/abs/2210.05357) | Pre-trained model Scoring | Model |An optimized version of FastVQAScorer, with improvements to the Fragment Sampling module, achieving significant speed enhancements | [0,1] | 17 | | [DOVERFilter](https://arxiv.org/abs/2211.04894) | Pre-trained model scoring | Model |Based on FastVQAScorer, it provides scores from both technical and aesthetic perspectives | | 18 | 19 | ## Video-Text Processing 20 | | Category Description | Metric List | 21 | |--- |--- | 22 | | Based on pre-trained vision-language models | EMScore, PAC-S | 23 | 24 | | Name | Filtering Metric | Filtering Dimension | Scorer Introduction | Score Range | 25 | | ---- | ---- | ---- | ---- | ---- | 26 | | [EMScorer](https://arxiv.org/abs/2111.08919) | Video-Text Similarity Scoring | Model | A video-text scorer based on CLIP, supporting both with-reference and no-reference scoring. 
| [0,1] | 27 | [PACScorer](https://arxiv.org/abs/2303.12112) | Video-Text Similarity Scoring | Model | A video-text scorer based on CLIP, with tuned CLIP Encoder on top of EMScore | [0,1] | -------------------------------------------------------------------------------- /docs/src/metrics/video_process.zh-CN.md: -------------------------------------------------------------------------------- 1 | # 视频数据处理 2 | 视频数据的处理主要是基于评估得分的数据集过滤方法。 3 | ## 纯视频处理 4 | ### 方法分类 5 | |类别描述 | 指标列表| 6 | |--- |--- | 7 | | 基于视频统计信息 | Motion Score| 8 | | 基于预训练模型 | FastVQAScorer, FasterVQAScorer, DOVERScorer| 9 | 10 | ### 方法介绍 11 | | 名称 | 过滤指标 | 过滤维度| 打分器简介 |打分取值范围| 12 | | ---- | ---- | ---- | ---- | ---- | 13 | | VideoMotionFilter | Motion Score| 统计|计算帧之间的光流向量的幅度作为评分 | | 14 | | [FastVQAFilter](https://arxiv.org/abs/2207.02595v1) | 预训练模型打分 | 模型 | 基于Video Swin Transformer的打分器,加入了Fragment Sampling模块,获得了准确性和速度的提升 | [0,1]| 15 | | [FasterVQAFilter](https://arxiv.org/abs/2210.05357) | 预训练模型打分 | 模型 | 基于Video Swin Transformer的打分器,在FastVQAScorer的基础上对Fragment Sampling模块进行优化,得到了显著的速度提升 | [0,1] | 16 | | [DOVERFilter](https://arxiv.org/abs/2211.04894) | 预训练模型打分 | 模型|基于FastVQAScorer的打分器,同时给出了从技术和美学两个角度的评分 || 17 | 18 | ## 视频-文本处理 19 | 20 | |类别描述 | 指标列表| 21 | |--- |--- | 22 | | 基于预训练图文模型 | EMScore, PAC-S| 23 | 24 | 25 | | 名称 | 过滤指标 | 过滤维度| 打分器简介 |打分取值范围| 26 | | ---- | ---- | ---- | ---- | ---- | 27 | | [EMScorer](https://arxiv.org/abs/2111.08919) | 基于视频-文本相似度的打分| 模型|基于CLIP的视频-文本打分器,同时支持with-reference和no-reference的打分功能|[0,1] | 28 | | [PACScorer](https://arxiv.org/abs/2303.12112) | 基于视频-文本相似度的打分 | 模型 | 基于CLIP的视频-文本打分器,在EMScore的基础上对CLIP Encoder进行了调优| [0,1] | 29 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from dataflow.utils import calculate_score 2 | 3 | calculate_score() 4 | -------------------------------------------------------------------------------- /image_process.py: 
-------------------------------------------------------------------------------- 1 | from dataflow.utils.registry import PROCESSOR_REGISTRY, FORMATTER_REGISTRY 2 | from dataflow.core import ScoreRecord 3 | from dataflow.config import new_init_config 4 | 5 | cfg = new_init_config() 6 | dataset_dict = {} 7 | score_record = ScoreRecord() 8 | for processor_name, processor_args in cfg.processors.items(): 9 | if "num_workers" in cfg: 10 | processor_args["num_workers"] = cfg.num_workers 11 | if "model_cache_path" in cfg: 12 | processor_args["model_cache_dir"] = cfg.model_cache_path 13 | processor = PROCESSOR_REGISTRY.get(processor_name)(args_dict=processor_args) 14 | if processor.data_type not in dataset_dict: 15 | formatter = FORMATTER_REGISTRY.get(cfg['data'][processor.data_type]['formatter'])(cfg['data'][processor.data_type]) 16 | datasets = formatter.load_dataset() 17 | dataset_dict[processor.data_type] = datasets 18 | dataset = datasets[0] if type(datasets) == tuple else datasets 19 | dataset.set_score_record(score_record) 20 | else: 21 | datasets = dataset_dict[processor.data_type] 22 | dataset_dict[processor.data_type] = processor(dataset_dict[processor.data_type]) 23 | print(dataset_dict[processor.data_type].indices) 24 | -------------------------------------------------------------------------------- /new_image_eval_example.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, '../DataFlow') 3 | 4 | from dataflow.utils.utils import calculate_score 5 | 6 | calculate_score() -------------------------------------------------------------------------------- /process.py: -------------------------------------------------------------------------------- 1 | from dataflow.utils.utils import process 2 | 3 | process() -------------------------------------------------------------------------------- /processed.jsonl: -------------------------------------------------------------------------------- 1 | 
{"problem": "A rectangle has a length of 10 cm and a width of 5 cm. What is its area?"} 2 | {"problem": "If a car travels at 60 km/h for 2 hours, how far does it go?"} 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | av==12.3.0 2 | decord==0.6.0 3 | einops==0.8.0 4 | fasttext==0.9.3 5 | filelock==3.15.4 6 | fsspec==2024.6.1 7 | ftfy==6.2.3 8 | google-api-core==2.19.1 9 | google-api-python-client==2.140.0 10 | google-auth==2.33.0 11 | google-auth-httplib2==0.2.0 12 | googleapis-common-protos==1.63.2 13 | imagededup == 0.3.2 14 | jsonargparse==4.32.0 15 | kenlm==0.2.0 16 | langkit==0.0.33 17 | loguru==0.7.2 18 | matplotlib==3.9.2 19 | multiprocess==0.70.16 20 | nltk==3.8 21 | numpy==1.26.4 22 | openai==1.44.1 23 | pandas==2.2.2 24 | prettytable==3.11.0 25 | pyspark==3.5.2 26 | PyYAML==6.0.2 27 | regex==2024.7.24 28 | safetensors==0.4.4 29 | scikit-learn==1.5.1 30 | scikit-video==1.1.11 31 | scipy==1.13.1 32 | sentencepiece==0.2.0 33 | setuptools==72.1.0 34 | timm==1.0.8 35 | torch==2.4.0 36 | torchvision==0.19.0 37 | tqdm==4.66.5 38 | transformers==4.44.2 39 | vendi-score==0.0.3 40 | vllm==0.6.0 41 | wget==3.2 42 | -------------------------------------------------------------------------------- /static/images/Face.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/static/images/Face.png -------------------------------------------------------------------------------- /static/images/example_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-DataFlow/DataFlow/ef7f30848d767ffc96d7733a8b6cfac2e4005f42/static/images/example_1.png --------------------------------------------------------------------------------