├── .gitignore ├── README.md ├── code ├── data_cleaning │ ├── __init__.py │ ├── anonymization.py │ ├── download_sentencepiece_kenlm_models.py │ ├── filtering.py │ ├── flagged_words.py │ ├── languages_id.py │ ├── main_filtering.py │ ├── normalization.py │ ├── parameters_filtering.py │ ├── run_example.sh │ ├── stopwords.py │ └── write_arrow_to_jsonl.py ├── exact_dedup │ ├── CONTRIBUTING.md │ ├── Cargo.lock │ ├── Cargo.toml │ ├── LICENSE │ ├── run_example.sh │ ├── scripts │ │ ├── cmp_dedup.py │ │ ├── count_occurrences.py │ │ ├── count_topk_occurrences.py │ │ ├── deduplicate_single_file.sh │ │ ├── finish_single_file_hf.py │ │ ├── load_dataset_hf.py │ │ └── make_suffix_array.py │ └── src │ │ ├── main.rs │ │ └── table.rs └── near_dedup │ └── run_example.sh ├── data ├── data_input │ └── sample.jsonl └── data_output │ └── .keep ├── requirements.txt └── run_example.sh /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/README.md -------------------------------------------------------------------------------- /code/data_cleaning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/__init__.py -------------------------------------------------------------------------------- /code/data_cleaning/anonymization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/anonymization.py -------------------------------------------------------------------------------- /code/data_cleaning/download_sentencepiece_kenlm_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/download_sentencepiece_kenlm_models.py -------------------------------------------------------------------------------- /code/data_cleaning/filtering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/filtering.py -------------------------------------------------------------------------------- /code/data_cleaning/flagged_words.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/flagged_words.py -------------------------------------------------------------------------------- /code/data_cleaning/languages_id.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/languages_id.py -------------------------------------------------------------------------------- /code/data_cleaning/main_filtering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/main_filtering.py -------------------------------------------------------------------------------- /code/data_cleaning/normalization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/normalization.py -------------------------------------------------------------------------------- /code/data_cleaning/parameters_filtering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/parameters_filtering.py -------------------------------------------------------------------------------- /code/data_cleaning/run_example.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/run_example.sh -------------------------------------------------------------------------------- /code/data_cleaning/stopwords.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/stopwords.py -------------------------------------------------------------------------------- /code/data_cleaning/write_arrow_to_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/data_cleaning/write_arrow_to_jsonl.py -------------------------------------------------------------------------------- /code/exact_dedup/CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/CONTRIBUTING.md -------------------------------------------------------------------------------- /code/exact_dedup/Cargo.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/Cargo.lock -------------------------------------------------------------------------------- /code/exact_dedup/Cargo.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/Cargo.toml -------------------------------------------------------------------------------- /code/exact_dedup/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/LICENSE -------------------------------------------------------------------------------- /code/exact_dedup/run_example.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/run_example.sh -------------------------------------------------------------------------------- /code/exact_dedup/scripts/cmp_dedup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/scripts/cmp_dedup.py -------------------------------------------------------------------------------- /code/exact_dedup/scripts/count_occurrences.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/scripts/count_occurrences.py -------------------------------------------------------------------------------- /code/exact_dedup/scripts/count_topk_occurrences.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/scripts/count_topk_occurrences.py -------------------------------------------------------------------------------- /code/exact_dedup/scripts/deduplicate_single_file.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/scripts/deduplicate_single_file.sh -------------------------------------------------------------------------------- /code/exact_dedup/scripts/finish_single_file_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/scripts/finish_single_file_hf.py -------------------------------------------------------------------------------- /code/exact_dedup/scripts/load_dataset_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/scripts/load_dataset_hf.py -------------------------------------------------------------------------------- /code/exact_dedup/scripts/make_suffix_array.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/scripts/make_suffix_array.py -------------------------------------------------------------------------------- /code/exact_dedup/src/main.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/src/main.rs -------------------------------------------------------------------------------- /code/exact_dedup/src/table.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/exact_dedup/src/table.rs -------------------------------------------------------------------------------- /code/near_dedup/run_example.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/code/near_dedup/run_example.sh -------------------------------------------------------------------------------- /data/data_input/sample.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/data/data_input/sample.jsonl -------------------------------------------------------------------------------- /data/data_output/.keep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/requirements.txt -------------------------------------------------------------------------------- /run_example.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/sailcraft/HEAD/run_example.sh --------------------------------------------------------------------------------