├── .github └── workflows │ └── python-app.yml ├── .gitignore ├── FlagOpen.png ├── LICENSE ├── README.md ├── README_zh.md ├── config └── cleaner_config.yaml ├── contact_me.png ├── dedup.png ├── dist ├── flagdata-1.0.0-py3-none-any.whl └── flagdata-1.0.0.tar.gz ├── flagdata ├── __init__.py ├── all2txt │ ├── README.md │ ├── README_zh.md │ ├── __init__.py │ ├── epub2txt.py │ └── pdf2txt.py ├── analysis │ ├── README.md │ ├── README_zh.md │ ├── __init__.py │ ├── average_rotation_analysis.py │ ├── data │ │ ├── average_rounds_data.jsonl │ │ └── language_distribution_data.jsonl │ ├── draw_pie_chart.py │ ├── field_distribution_analysis.py │ ├── language_distribution_analysis.py │ ├── nested_pie_chart.py │ ├── png │ │ ├── field_distribution_analysis.gif │ │ └── language_distribution_analysis.png │ └── text_length_analysis.py ├── cleaner │ ├── __init__.py │ ├── arxiv_cleaner.py │ ├── base_cleaner.py │ ├── book_cleaner.py │ ├── cleaner_builder.py │ ├── configs │ │ ├── arxiv_clean.yaml │ │ ├── book_clean.yaml │ │ ├── book_config.json │ │ ├── html_clean.yaml │ │ ├── qa_clean.yaml │ │ └── text_clean.yaml │ ├── docs │ │ ├── Book_Cleaner.md │ │ ├── Book_Cleaner_ZH.md │ │ ├── Qa_Cleaner.md │ │ ├── Qa_Cleaner_ZH.md │ │ └── Text_Cleaner.md │ ├── html_cleaner.py │ ├── input │ │ ├── arxiv_demo_input.jsonl │ │ ├── book_demo_data │ │ │ ├── 11836262_Wolf s Mate.epub │ │ │ ├── 11836269_Noches de terciopelo (Spanish Edition).epub │ │ │ ├── 11836275_《术士的指环》合集(第一、二、三卷).epub │ │ │ ├── 11836283_《望古神话之秦墟》(全本校对)作者 月关.epub │ │ │ ├── 11836286_《五代刀锋》[长篇历史小说·实体书版套装3册].epub │ │ │ ├── 11836294_Mensajes desde el lago (Spanish Edition).epub │ │ │ ├── 11836298_La vida es corta pero ancha.epub │ │ │ ├── 11836315_El Guardaespaldas (Spanish Edition).epub │ │ │ ├── 11836358_The Moth Catcher (Vera Stanhope series Book 7).mobi │ │ │ ├── 11836359_Intern For My Best Friend s Dad An Instalove Possessive Age Gap Romanc.epub │ │ │ └── 11836366_Keeper of the Innocents (The Keeper Witches 2) .epub │ │ ├── html_demo_input.txt │ │ ├── qa_demo_input.jsonl │ │ ├── ref.jsonl │ │ └── text_demo_input.jsonl │ ├── qa_cleaner.py │ ├── text_cleaner.py │ └── utils │ │ ├── __init__.py │ │ ├── common_utils.py │ │ ├── extractor.py │ │ ├── http_utils.py │ │ ├── ruleset.py │ │ ├── string_utils.py │ │ └── time_formatter.py ├── data_gen │ ├── README.md │ ├── README_zh.md │ ├── __init__.py │ ├── example.py │ ├── prompt_template.py │ ├── strategy.py │ └── utils.py ├── data_operator │ ├── Operator.md │ ├── Operator_ZH.md │ ├── __init__.py │ ├── base_operator.py │ ├── formatter │ │ ├── __init__.py │ │ ├── base_formatter.py │ │ ├── csv_formatter.py │ │ ├── json_formatter.py │ │ ├── parquet_formatter.py │ │ └── tsv_formatter.py │ ├── make_data.py │ ├── new_data.csv │ ├── new_data.json │ ├── new_data.parquet │ ├── new_data.tsv │ ├── pruner │ │ ├── __init__.py │ │ ├── catalogue_pruner.py │ │ ├── chinese_conversion_pruner.py │ │ ├── consecutive_newlines_pruner.py │ │ ├── control_char_pruner.py │ │ ├── copyright_pruner.py │ │ ├── email_pruner.py │ │ ├── end_at_last_punctuation_pruner.py │ │ ├── figuret_able_caption_pruner.py │ │ ├── ip_pruner.py │ │ ├── latex_macro_expander_pruner.py │ │ ├── link_pruner.py │ │ ├── non_chinese_char_pruner.py │ │ ├── punctuation_normalization_pruner.py │ │ ├── repeat_sentence_pruner.py │ │ ├── replace_pruner.py │ │ ├── specific_pattern_pruner.py │ │ ├── table_pruner.py │ │ ├── test.py │ │ └── unicode_pruner.py │ ├── samplefilter │ │ ├── __init__.py │ │ ├── actionalbe_verb_num_filter.py │ │ ├── alphanumeric_ratio_filter.py │ │ ├── avg_line_length_filter.py │ │ ├── field_value_filter.py │ │ ├── flagged_words_ratio_filter.py │ │ ├── language_confidence_filter.py │ │ ├── max_line_length_filter.py │ │ ├── numeric_field_value_filter.py │ │ ├── special_character_ratio_filter.py │ │ ├── stropword_ratio_filter.py │ │ ├── suffix_filter.py │ │ ├── text_length_filter.py │ │ ├── token_num_filter.py │ │ ├── word_num_filter.py │ │ └── word_repetition_ratio_filter.py │ ├── test.py │ ├── test_data.csv │ ├── test_data.json │ ├── test_data.parquet │ └── test_data.tsv ├── deduplication │ ├── README.md │ ├── README_zh.md │ ├── __init__.py │ ├── minhash.py │ ├── stringMatching.py │ └── udf_spark_stringMatching.py ├── language_identification │ ├── README.md │ ├── README_zh.md │ ├── __init__.py │ ├── jsonql.py │ └── split_by_lang.py └── quality_assessment │ ├── Bert │ ├── bert_config.yaml │ ├── evaluate.py │ ├── input_data │ │ └── example_data.jsonl │ ├── models │ │ ├── config.json │ │ └── tokenizer │ │ │ ├── special_tokens_map.json │ │ │ ├── tokenizer_config.json │ │ │ └── vocab.txt │ ├── network │ │ ├── document_bert_architectures.py │ │ └── model_architechure_bert_multi_scale.py │ └── utils │ │ ├── data.py │ │ └── encode.py │ ├── FastText │ ├── data │ │ ├── cleared1.jsonl │ │ └── cn_stopwords.txt │ └── evaluate.py │ ├── README.md │ ├── README_zh.md │ └── quality_assessment.png ├── flagdata_logo.png ├── pic ├── data_operator.png ├── some_operator.png └── users.png ├── pipeline.png ├── pipeline_zh.png ├── pyproject.toml ├── quickstart └── cleaner │ ├── run_cleaner.py │ └── run_custom_cleaner.py ├── requirements.txt └── tests └── test_cleaner.py /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/.github/workflows/python-app.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/.gitignore -------------------------------------------------------------------------------- /FlagOpen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/FlagOpen.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/README.md -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/README_zh.md -------------------------------------------------------------------------------- /config/cleaner_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/config/cleaner_config.yaml -------------------------------------------------------------------------------- /contact_me.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/contact_me.png -------------------------------------------------------------------------------- /dedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/dedup.png -------------------------------------------------------------------------------- /dist/flagdata-1.0.0-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/dist/flagdata-1.0.0-py3-none-any.whl -------------------------------------------------------------------------------- /dist/flagdata-1.0.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/dist/flagdata-1.0.0.tar.gz -------------------------------------------------------------------------------- /flagdata/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | -------------------------------------------------------------------------------- /flagdata/all2txt/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/all2txt/README.md -------------------------------------------------------------------------------- /flagdata/all2txt/README_zh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/all2txt/README_zh.md -------------------------------------------------------------------------------- /flagdata/all2txt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flagdata/all2txt/epub2txt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/all2txt/epub2txt.py -------------------------------------------------------------------------------- /flagdata/all2txt/pdf2txt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/all2txt/pdf2txt.py -------------------------------------------------------------------------------- /flagdata/analysis/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/README.md -------------------------------------------------------------------------------- /flagdata/analysis/README_zh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/README_zh.md -------------------------------------------------------------------------------- /flagdata/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flagdata/analysis/average_rotation_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/average_rotation_analysis.py -------------------------------------------------------------------------------- /flagdata/analysis/data/average_rounds_data.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/data/average_rounds_data.jsonl -------------------------------------------------------------------------------- /flagdata/analysis/data/language_distribution_data.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/data/language_distribution_data.jsonl -------------------------------------------------------------------------------- /flagdata/analysis/draw_pie_chart.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/draw_pie_chart.py -------------------------------------------------------------------------------- /flagdata/analysis/field_distribution_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/field_distribution_analysis.py -------------------------------------------------------------------------------- /flagdata/analysis/language_distribution_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/language_distribution_analysis.py -------------------------------------------------------------------------------- /flagdata/analysis/nested_pie_chart.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/nested_pie_chart.py -------------------------------------------------------------------------------- /flagdata/analysis/png/field_distribution_analysis.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/png/field_distribution_analysis.gif -------------------------------------------------------------------------------- /flagdata/analysis/png/language_distribution_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/png/language_distribution_analysis.png -------------------------------------------------------------------------------- /flagdata/analysis/text_length_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/analysis/text_length_analysis.py -------------------------------------------------------------------------------- /flagdata/cleaner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flagdata/cleaner/arxiv_cleaner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/arxiv_cleaner.py -------------------------------------------------------------------------------- /flagdata/cleaner/base_cleaner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/base_cleaner.py -------------------------------------------------------------------------------- /flagdata/cleaner/book_cleaner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/book_cleaner.py -------------------------------------------------------------------------------- /flagdata/cleaner/cleaner_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/cleaner_builder.py -------------------------------------------------------------------------------- /flagdata/cleaner/configs/arxiv_clean.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/configs/arxiv_clean.yaml -------------------------------------------------------------------------------- /flagdata/cleaner/configs/book_clean.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/configs/book_clean.yaml -------------------------------------------------------------------------------- /flagdata/cleaner/configs/book_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/configs/book_config.json -------------------------------------------------------------------------------- /flagdata/cleaner/configs/html_clean.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/configs/html_clean.yaml -------------------------------------------------------------------------------- /flagdata/cleaner/configs/qa_clean.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flagdata/cleaner/configs/text_clean.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/configs/text_clean.yaml -------------------------------------------------------------------------------- /flagdata/cleaner/docs/Book_Cleaner.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/docs/Book_Cleaner.md -------------------------------------------------------------------------------- /flagdata/cleaner/docs/Book_Cleaner_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/docs/Book_Cleaner_ZH.md -------------------------------------------------------------------------------- /flagdata/cleaner/docs/Qa_Cleaner.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/docs/Qa_Cleaner.md -------------------------------------------------------------------------------- /flagdata/cleaner/docs/Qa_Cleaner_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/docs/Qa_Cleaner_ZH.md -------------------------------------------------------------------------------- /flagdata/cleaner/docs/Text_Cleaner.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/docs/Text_Cleaner.md -------------------------------------------------------------------------------- /flagdata/cleaner/html_cleaner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/html_cleaner.py -------------------------------------------------------------------------------- /flagdata/cleaner/input/arxiv_demo_input.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/arxiv_demo_input.jsonl -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836262_Wolf s Mate.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836262_Wolf s Mate.epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836269_Noches de terciopelo (Spanish Edition).epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836269_Noches de terciopelo (Spanish Edition).epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836275_《术士的指环》合集(第一、二、三卷).epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836275_《术士的指环》合集(第一、二、三卷).epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836283_《望古神话之秦墟》(全本校对)作者 月关.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836283_《望古神话之秦墟》(全本校对)作者 月关.epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836286_《五代刀锋》[长篇历史小说·实体书版套装3册].epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836286_《五代刀锋》[长篇历史小说·实体书版套装3册].epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836294_Mensajes desde el lago (Spanish Edition).epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836294_Mensajes desde el lago (Spanish Edition).epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836298_La vida es corta pero ancha.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836298_La vida es corta pero ancha.epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836315_El Guardaespaldas (Spanish Edition).epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836315_El Guardaespaldas (Spanish Edition).epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836358_The Moth Catcher (Vera Stanhope series Book 7).mobi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836358_The Moth Catcher (Vera Stanhope series Book 7).mobi -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836359_Intern For My Best Friend s Dad An Instalove Possessive Age Gap Romanc.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836359_Intern For My Best Friend s Dad An Instalove Possessive Age Gap Romanc.epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/book_demo_data/11836366_Keeper of the Innocents (The Keeper Witches 2) .epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/book_demo_data/11836366_Keeper of the Innocents (The Keeper Witches 2) .epub -------------------------------------------------------------------------------- /flagdata/cleaner/input/html_demo_input.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/html_demo_input.txt -------------------------------------------------------------------------------- /flagdata/cleaner/input/qa_demo_input.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/qa_demo_input.jsonl -------------------------------------------------------------------------------- /flagdata/cleaner/input/ref.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/ref.jsonl -------------------------------------------------------------------------------- /flagdata/cleaner/input/text_demo_input.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/input/text_demo_input.jsonl -------------------------------------------------------------------------------- /flagdata/cleaner/qa_cleaner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/qa_cleaner.py -------------------------------------------------------------------------------- /flagdata/cleaner/text_cleaner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/text_cleaner.py -------------------------------------------------------------------------------- /flagdata/cleaner/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flagdata/cleaner/utils/common_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/utils/common_utils.py -------------------------------------------------------------------------------- /flagdata/cleaner/utils/extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/utils/extractor.py -------------------------------------------------------------------------------- /flagdata/cleaner/utils/http_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/utils/http_utils.py -------------------------------------------------------------------------------- /flagdata/cleaner/utils/ruleset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/utils/ruleset.py -------------------------------------------------------------------------------- /flagdata/cleaner/utils/string_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/utils/string_utils.py -------------------------------------------------------------------------------- /flagdata/cleaner/utils/time_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/cleaner/utils/time_formatter.py -------------------------------------------------------------------------------- /flagdata/data_gen/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_gen/README.md -------------------------------------------------------------------------------- /flagdata/data_gen/README_zh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_gen/README_zh.md -------------------------------------------------------------------------------- /flagdata/data_gen/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flagdata/data_gen/example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_gen/example.py -------------------------------------------------------------------------------- /flagdata/data_gen/prompt_template.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_gen/prompt_template.py -------------------------------------------------------------------------------- /flagdata/data_gen/strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_gen/strategy.py -------------------------------------------------------------------------------- /flagdata/data_gen/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_gen/utils.py -------------------------------------------------------------------------------- /flagdata/data_operator/Operator.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/Operator.md -------------------------------------------------------------------------------- /flagdata/data_operator/Operator_ZH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/Operator_ZH.md -------------------------------------------------------------------------------- /flagdata/data_operator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/__init__.py -------------------------------------------------------------------------------- /flagdata/data_operator/base_operator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/base_operator.py -------------------------------------------------------------------------------- /flagdata/data_operator/formatter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/formatter/__init__.py -------------------------------------------------------------------------------- /flagdata/data_operator/formatter/base_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/formatter/base_formatter.py -------------------------------------------------------------------------------- /flagdata/data_operator/formatter/csv_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/formatter/csv_formatter.py -------------------------------------------------------------------------------- /flagdata/data_operator/formatter/json_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/formatter/json_formatter.py -------------------------------------------------------------------------------- /flagdata/data_operator/formatter/parquet_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/formatter/parquet_formatter.py -------------------------------------------------------------------------------- /flagdata/data_operator/formatter/tsv_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/formatter/tsv_formatter.py -------------------------------------------------------------------------------- /flagdata/data_operator/make_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/make_data.py -------------------------------------------------------------------------------- /flagdata/data_operator/new_data.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/new_data.csv -------------------------------------------------------------------------------- /flagdata/data_operator/new_data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/new_data.json -------------------------------------------------------------------------------- /flagdata/data_operator/new_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/new_data.parquet -------------------------------------------------------------------------------- /flagdata/data_operator/new_data.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/new_data.tsv -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/__init__.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/catalogue_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/catalogue_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/chinese_conversion_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/chinese_conversion_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/consecutive_newlines_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/consecutive_newlines_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/control_char_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/control_char_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/copyright_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/copyright_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/email_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/email_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/end_at_last_punctuation_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/end_at_last_punctuation_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/figuret_able_caption_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/figuret_able_caption_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/ip_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/ip_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/latex_macro_expander_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/latex_macro_expander_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/link_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/link_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/non_chinese_char_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/non_chinese_char_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/punctuation_normalization_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/punctuation_normalization_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/repeat_sentence_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/repeat_sentence_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/replace_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/replace_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/specific_pattern_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/specific_pattern_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/table_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/table_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/test.py -------------------------------------------------------------------------------- /flagdata/data_operator/pruner/unicode_pruner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/pruner/unicode_pruner.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/__init__.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/actionalbe_verb_num_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/actionalbe_verb_num_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/alphanumeric_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/alphanumeric_ratio_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/avg_line_length_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/avg_line_length_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/field_value_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/field_value_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/flagged_words_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/flagged_words_ratio_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/language_confidence_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/language_confidence_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/max_line_length_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/max_line_length_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/numeric_field_value_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/numeric_field_value_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/special_character_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/special_character_ratio_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/stropword_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/stropword_ratio_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/suffix_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/suffix_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/text_length_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/text_length_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/token_num_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/token_num_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/word_num_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/word_num_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/samplefilter/word_repetition_ratio_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/samplefilter/word_repetition_ratio_filter.py -------------------------------------------------------------------------------- /flagdata/data_operator/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/test.py -------------------------------------------------------------------------------- /flagdata/data_operator/test_data.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/test_data.csv -------------------------------------------------------------------------------- /flagdata/data_operator/test_data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/test_data.json -------------------------------------------------------------------------------- /flagdata/data_operator/test_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/test_data.parquet -------------------------------------------------------------------------------- /flagdata/data_operator/test_data.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/data_operator/test_data.tsv -------------------------------------------------------------------------------- /flagdata/deduplication/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/deduplication/README.md -------------------------------------------------------------------------------- /flagdata/deduplication/README_zh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/deduplication/README_zh.md -------------------------------------------------------------------------------- /flagdata/deduplication/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flagdata/deduplication/minhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/deduplication/minhash.py -------------------------------------------------------------------------------- /flagdata/deduplication/stringMatching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/deduplication/stringMatching.py -------------------------------------------------------------------------------- /flagdata/deduplication/udf_spark_stringMatching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/deduplication/udf_spark_stringMatching.py -------------------------------------------------------------------------------- /flagdata/language_identification/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/language_identification/README.md -------------------------------------------------------------------------------- /flagdata/language_identification/README_zh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/language_identification/README_zh.md -------------------------------------------------------------------------------- /flagdata/language_identification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /flagdata/language_identification/jsonql.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/language_identification/jsonql.py -------------------------------------------------------------------------------- /flagdata/language_identification/split_by_lang.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/language_identification/split_by_lang.py -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/bert_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/bert_config.yaml -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/evaluate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/evaluate.py -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/input_data/example_data.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/input_data/example_data.jsonl -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/models/config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/models/config.json -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/models/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/models/tokenizer/special_tokens_map.json -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/models/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/models/tokenizer/tokenizer_config.json -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/models/tokenizer/vocab.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/models/tokenizer/vocab.txt -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/network/document_bert_architectures.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/network/document_bert_architectures.py -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/network/model_architechure_bert_multi_scale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/network/model_architechure_bert_multi_scale.py -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/utils/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/utils/data.py -------------------------------------------------------------------------------- /flagdata/quality_assessment/Bert/utils/encode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/Bert/utils/encode.py -------------------------------------------------------------------------------- /flagdata/quality_assessment/FastText/data/cleared1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/FastText/data/cleared1.jsonl -------------------------------------------------------------------------------- /flagdata/quality_assessment/FastText/data/cn_stopwords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/FastText/data/cn_stopwords.txt -------------------------------------------------------------------------------- /flagdata/quality_assessment/FastText/evaluate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/FastText/evaluate.py -------------------------------------------------------------------------------- /flagdata/quality_assessment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/README.md -------------------------------------------------------------------------------- /flagdata/quality_assessment/README_zh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/README_zh.md -------------------------------------------------------------------------------- /flagdata/quality_assessment/quality_assessment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata/quality_assessment/quality_assessment.png -------------------------------------------------------------------------------- /flagdata_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/flagdata_logo.png -------------------------------------------------------------------------------- /pic/data_operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/pic/data_operator.png -------------------------------------------------------------------------------- /pic/some_operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/pic/some_operator.png -------------------------------------------------------------------------------- /pic/users.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/pic/users.png -------------------------------------------------------------------------------- /pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/pipeline.png -------------------------------------------------------------------------------- /pipeline_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/pipeline_zh.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/pyproject.toml -------------------------------------------------------------------------------- /quickstart/cleaner/run_cleaner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/quickstart/cleaner/run_cleaner.py -------------------------------------------------------------------------------- /quickstart/cleaner/run_custom_cleaner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/quickstart/cleaner/run_custom_cleaner.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/requirements.txt -------------------------------------------------------------------------------- /tests/test_cleaner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagData/HEAD/tests/test_cleaner.py --------------------------------------------------------------------------------