├── .gitignore ├── LICENSE ├── README.md ├── setup.py ├── zyda ├── __init__.py ├── connected_components │ ├── generate_connected_components.py │ └── generate_indices_to_remove.py ├── lsh_minhash │ ├── __init__.py │ ├── build_lsh_index.py │ ├── compute_minhash.py │ └── compute_optimal_params.py ├── preprocessing_and_filtering │ ├── __init__.py │ ├── cursed_substrings.json │ ├── preprocess_and_filter.py │ ├── profanity_word_list.json │ ├── sexual_word_list.json │ └── zh_pornsignals.json └── utils │ ├── __init__.py │ ├── common.py │ ├── filtering.py │ └── text.py └── zyda_reproduction ├── 1_downloading ├── download_arxiv_pile_peS2o_c4_refinedweb.py ├── download_refinedweb.py ├── process_repo_slimpajama.py └── process_repo_starcoder.py ├── 2_preprocessing_and_filtering ├── preprocess_pile_c4_peS2o_arxiv.sh ├── preprocess_refinedweb.sh ├── preprocess_slimpajama.sh └── preprocess_starcoder.sh ├── 3_minhashing ├── minhash_pile_c4_peS2o_arxiv.sh ├── minhash_refinedweb.sh ├── minhash_slimpajama.sh └── minhash_starcoder.sh ├── 4_lsh_indexing └── run_lsh_dupes_0.4_all.sh ├── 5_clustering └── run_cc_lsh_0.4_dupes.sh └── 6_generating_final_dataset ├── convert_jsonls_to_parquet.py ├── generate_final_jsonls.py ├── run_convert_to_parquet_lsh_0.4.sh └── run_generate_lsh_0.4_jsonls.sh /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/setup.py -------------------------------------------------------------------------------- /zyda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /zyda/connected_components/generate_connected_components.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/connected_components/generate_connected_components.py -------------------------------------------------------------------------------- /zyda/connected_components/generate_indices_to_remove.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/connected_components/generate_indices_to_remove.py -------------------------------------------------------------------------------- /zyda/lsh_minhash/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /zyda/lsh_minhash/build_lsh_index.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/lsh_minhash/build_lsh_index.py -------------------------------------------------------------------------------- /zyda/lsh_minhash/compute_minhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/lsh_minhash/compute_minhash.py -------------------------------------------------------------------------------- /zyda/lsh_minhash/compute_optimal_params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/lsh_minhash/compute_optimal_params.py -------------------------------------------------------------------------------- /zyda/preprocessing_and_filtering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /zyda/preprocessing_and_filtering/cursed_substrings.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/preprocessing_and_filtering/cursed_substrings.json -------------------------------------------------------------------------------- /zyda/preprocessing_and_filtering/preprocess_and_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/preprocessing_and_filtering/preprocess_and_filter.py -------------------------------------------------------------------------------- /zyda/preprocessing_and_filtering/profanity_word_list.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/preprocessing_and_filtering/profanity_word_list.json -------------------------------------------------------------------------------- /zyda/preprocessing_and_filtering/sexual_word_list.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/preprocessing_and_filtering/sexual_word_list.json -------------------------------------------------------------------------------- /zyda/preprocessing_and_filtering/zh_pornsignals.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/preprocessing_and_filtering/zh_pornsignals.json -------------------------------------------------------------------------------- /zyda/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /zyda/utils/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/utils/common.py -------------------------------------------------------------------------------- /zyda/utils/filtering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/utils/filtering.py -------------------------------------------------------------------------------- /zyda/utils/text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda/utils/text.py -------------------------------------------------------------------------------- /zyda_reproduction/1_downloading/download_arxiv_pile_peS2o_c4_refinedweb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/1_downloading/download_arxiv_pile_peS2o_c4_refinedweb.py -------------------------------------------------------------------------------- /zyda_reproduction/1_downloading/download_refinedweb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/1_downloading/download_refinedweb.py -------------------------------------------------------------------------------- /zyda_reproduction/1_downloading/process_repo_slimpajama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/1_downloading/process_repo_slimpajama.py -------------------------------------------------------------------------------- /zyda_reproduction/1_downloading/process_repo_starcoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/1_downloading/process_repo_starcoder.py -------------------------------------------------------------------------------- /zyda_reproduction/2_preprocessing_and_filtering/preprocess_pile_c4_peS2o_arxiv.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/2_preprocessing_and_filtering/preprocess_pile_c4_peS2o_arxiv.sh -------------------------------------------------------------------------------- /zyda_reproduction/2_preprocessing_and_filtering/preprocess_refinedweb.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/2_preprocessing_and_filtering/preprocess_refinedweb.sh -------------------------------------------------------------------------------- /zyda_reproduction/2_preprocessing_and_filtering/preprocess_slimpajama.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/2_preprocessing_and_filtering/preprocess_slimpajama.sh -------------------------------------------------------------------------------- /zyda_reproduction/2_preprocessing_and_filtering/preprocess_starcoder.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/2_preprocessing_and_filtering/preprocess_starcoder.sh -------------------------------------------------------------------------------- /zyda_reproduction/3_minhashing/minhash_pile_c4_peS2o_arxiv.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/3_minhashing/minhash_pile_c4_peS2o_arxiv.sh -------------------------------------------------------------------------------- /zyda_reproduction/3_minhashing/minhash_refinedweb.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/3_minhashing/minhash_refinedweb.sh -------------------------------------------------------------------------------- /zyda_reproduction/3_minhashing/minhash_slimpajama.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/3_minhashing/minhash_slimpajama.sh -------------------------------------------------------------------------------- /zyda_reproduction/3_minhashing/minhash_starcoder.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/3_minhashing/minhash_starcoder.sh -------------------------------------------------------------------------------- /zyda_reproduction/4_lsh_indexing/run_lsh_dupes_0.4_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/4_lsh_indexing/run_lsh_dupes_0.4_all.sh -------------------------------------------------------------------------------- /zyda_reproduction/5_clustering/run_cc_lsh_0.4_dupes.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/5_clustering/run_cc_lsh_0.4_dupes.sh -------------------------------------------------------------------------------- /zyda_reproduction/6_generating_final_dataset/convert_jsonls_to_parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/6_generating_final_dataset/convert_jsonls_to_parquet.py -------------------------------------------------------------------------------- /zyda_reproduction/6_generating_final_dataset/generate_final_jsonls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/6_generating_final_dataset/generate_final_jsonls.py -------------------------------------------------------------------------------- /zyda_reproduction/6_generating_final_dataset/run_convert_to_parquet_lsh_0.4.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/6_generating_final_dataset/run_convert_to_parquet_lsh_0.4.sh -------------------------------------------------------------------------------- /zyda_reproduction/6_generating_final_dataset/run_generate_lsh_0.4_jsonls.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zyphra/Zyda_processing/HEAD/zyda_reproduction/6_generating_final_dataset/run_generate_lsh_0.4_jsonls.sh --------------------------------------------------------------------------------