├── .github └── workflows │ ├── code_quality.yml │ └── test.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── README_training.md ├── bsmetadata ├── __init__.py ├── deepspeed_configs │ └── v2.json ├── evaluation.py ├── evaluation_utils.py ├── experiments │ ├── __init__.py │ ├── datasetv2.py │ ├── sample.py │ ├── with_metadata.py │ ├── with_metadata_datasetv2.py │ ├── with_metadata_datasetv2_tf.py │ └── without_metadata.py ├── hydra_configs │ ├── html_config.yaml │ ├── test.yaml │ └── v2.yaml ├── input_pipeline.py ├── metadata_processors.py ├── metadata_utils.py ├── paragraph_by_metadata_html.py ├── post_processing_utils.py ├── preprocessing_scripts │ ├── download_entity_processing_files.sh │ └── download_wiki_dump.sh ├── preprocessing_tools │ ├── __init__.py │ ├── html_parser │ │ ├── __init__.py │ │ ├── filters_and_cleaners.py │ │ ├── objects.py │ │ └── variables.py │ └── wikipedia_desc_utils.py ├── preprocessing_utils.py └── train.py ├── examples └── build_dataset │ ├── README.md │ ├── config.yaml │ └── run_build_dataset_with_metadata.py ├── experiments ├── .gitkeep ├── gcp │ └── dataset │ │ └── c4 │ │ ├── README.md │ │ ├── tag_metadata.py │ │ └── upload.sh ├── hpsearch │ └── test.sh ├── jz │ ├── README.md │ ├── dataset │ │ └── c4 │ │ │ ├── README.md │ │ │ ├── c4-en-html-deduped-full │ │ │ ├── 00_import_c4.slurm │ │ │ ├── 01_add_metadata.slurm │ │ │ ├── 02_add_website_desc.slurm │ │ │ ├── 03_recreate_arrow_dataset.slurm │ │ │ ├── export_to_jsonlines.slurm │ │ │ ├── post_process_website_desc.slurm │ │ │ └── push_to_hub.slurm │ │ │ ├── c4-en-v2 │ │ │ ├── 00_import_c4.slurm │ │ │ ├── 01_add_metadata_to_toy_c4_dataset.slurm │ │ │ ├── 02_add_website_desc.slurm │ │ │ ├── 03_add_entities.slurm │ │ │ ├── 04_recreate_arrow_dataset.slurm │ │ │ ├── 05_add_entitiies_v2.slurm │ │ │ ├── 06_shard_ds.slurm │ │ │ ├── 07_add_entitiies_v3.slurm │ │ │ ├── 08_add_entitiies_batch_2.slurm │ │ │ ├── 09_add_entitiies_batch_1_2_retry.slurm │ │ │ ├── 10_add_entitiies_batch_1_2_retry_2.slurm │ │ │ ├── 11_add_entitiies_batch_2_retry_oom.slurm │ │ │ ├── 12_add_entitiies_batch_1_retry_oom.slurm │ │ │ ├── 13_add_entitiies_batch_2_retry_oom_2.slurm │ │ │ ├── 14_add_entitiies_batch_2_retry_oom_3.slurm │ │ │ ├── 15_add_entitiies_batch_2_retry_oom_4.slurm │ │ │ ├── 16_add_entitiies_batch_2_retry_oom_5.slurm │ │ │ ├── 17_reconcatenate.slurm │ │ │ ├── 18_reconcatenate_batch_1.slurm │ │ │ ├── 19_export_to_jsonlines.slurm │ │ │ ├── 20_push_to_hub.slurm │ │ │ ├── README.md │ │ │ └── configs │ │ │ │ ├── 17_config.txt │ │ │ │ └── 18_config.txt │ │ │ └── python_scripts │ │ │ ├── add_metadata.py │ │ │ ├── concatenate_dataset.py │ │ │ ├── create_arrow_dataset.py │ │ │ ├── export_to_compressed_jsonl.py │ │ │ ├── post_process.py │ │ │ └── shard_ds.py │ ├── entity │ │ └── exp_1 │ │ │ └── toy_example │ │ │ ├── 01_load_tokenizer_and_model.slurm │ │ │ ├── 02_load_dataset.slurm │ │ │ ├── 03_create_dataset.slurm │ │ │ └── 04_do_training.slurm │ ├── joint_training_toy │ │ ├── joint_training_toy1 │ │ │ ├── 01_load_tokenizer_and_model.slurm │ │ │ ├── 02_load_dataset.slurm │ │ │ ├── 03_create_dataset.slurm │ │ │ ├── 04_do_training.slurm │ │ │ ├── README.md │ │ │ └── multi_steps.bash │ │ ├── joint_training_toy2-fp16 │ │ │ ├── 01_load_tokenizer_and_model.slurm │ │ │ ├── 02_load_dataset.slurm │ │ │ ├── 03_create_dataset.slurm │ │ │ ├── 04_do_training.slurm │ │ │ ├── README.md │ │ │ └── multi_steps.bash │ │ ├── joint_training_toy3-fp16-multigpu │ │ │ ├── 01_load_tokenizer_and_model.slurm │ │ │ ├── 02_load_dataset.slurm │ │ │ ├── 03_create_dataset.slurm │ │ │ ├── 04_do_training.slurm │ │ │ ├── README.md │ │ │ ├── local_test.sh │ │ │ └── multi_steps.bash │ │ └── joint_training_toy4-test-gpt2-xl │ │ │ ├── 04_do_training.slurm │ │ │ └── local_test.sh │ ├── templates │ │ └── SLURM │ │ │ ├── experiment_example │ │ │ ├── README.md │ │ │ └── subexperiment_1 │ │ │ │ ├── 01_load_tokenizer_and_model.slurm │ │ │ │ ├── 02_load_dataset.slurm │ │ │ │ ├── 03_create_dataset.slurm │ │ │ │ ├── 04_do_training.slurm │ │ │ │ └── multi_steps.bash │ │ │ └── experiment_template │ │ │ ├── 01_load_tokenizer_and_model.slurm │ │ │ ├── 02_load_dataset.slurm │ │ │ ├── 03_create_dataset.slurm │ │ │ ├── 04_do_training.slurm │ │ │ ├── README.md │ │ │ └── multi_steps.bash │ ├── toy_experiments │ │ ├── README.md │ │ └── dumpdb │ │ │ ├── dumpdb.py │ │ │ └── dumpdb.slurm │ ├── utils │ │ ├── convert_checkpoint_to_hf_format.py │ │ ├── loading_script_utils │ │ │ ├── load_dataset.py │ │ │ └── load_tokenizer_and_model.py │ │ └── sync_wandb.slurm │ └── website_metadata │ │ └── exp_1 │ │ ├── model_15k │ │ ├── 01_load_tokenizer_and_model.slurm │ │ ├── 02_load_dataset.slurm │ │ ├── 03_create_dataset.slurm │ │ └── 04_do_training.slurm │ │ ├── model_25k_clean │ │ ├── 01_load_tokenizer_and_model.slurm │ │ ├── 02_load_dataset.slurm │ │ ├── 03_create_dataset.slurm │ │ └── 04_do_training.slurm │ │ └── toy_example │ │ ├── 01_load_tokenizer_and_model.slurm │ │ ├── 02_load_dataset.slurm │ │ ├── 03_create_dataset.slurm │ │ └── 04_do_training.slurm └── shared_tips │ └── upload_custom_dataset_on_the_hub.md ├── input_examples.jsonl ├── poetry.lock ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── requirements_resolved_with_extras_and_dev.txt ├── setup.cfg ├── setup.py └── tests ├── data ├── train_toy_raw_wikitext.jsonl ├── train_toy_wikitext_with_metadata.jsonl ├── val_toy_raw_wikitext.jsonl └── val_toy_wikitext_with_metadata.jsonl ├── mocks └── mock_dump_db.py ├── preprocessing_tools └── html_parser │ └── test_html_parser.py ├── test_datasource_preprocessor.py ├── test_entity_preprocessor.py ├── test_get_dataloaders.py ├── test_get_paragraphs.py ├── test_length_preprocessor.py ├── test_metadata_utils.py ├── test_parse_date.py ├── test_preprocessing_utils.py ├── test_train.py └── test_website_post_processor.py /.github/workflows/code_quality.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/.github/workflows/code_quality.yml -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/.github/workflows/test.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/.gitmodules -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/README.md -------------------------------------------------------------------------------- /README_training.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/README_training.md -------------------------------------------------------------------------------- /bsmetadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bsmetadata/deepspeed_configs/v2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/deepspeed_configs/v2.json -------------------------------------------------------------------------------- /bsmetadata/evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/evaluation.py -------------------------------------------------------------------------------- /bsmetadata/evaluation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/evaluation_utils.py -------------------------------------------------------------------------------- /bsmetadata/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bsmetadata/experiments/datasetv2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/experiments/datasetv2.py -------------------------------------------------------------------------------- /bsmetadata/experiments/sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/experiments/sample.py -------------------------------------------------------------------------------- /bsmetadata/experiments/with_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/experiments/with_metadata.py -------------------------------------------------------------------------------- /bsmetadata/experiments/with_metadata_datasetv2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/experiments/with_metadata_datasetv2.py -------------------------------------------------------------------------------- /bsmetadata/experiments/with_metadata_datasetv2_tf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/experiments/with_metadata_datasetv2_tf.py -------------------------------------------------------------------------------- /bsmetadata/experiments/without_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/experiments/without_metadata.py -------------------------------------------------------------------------------- /bsmetadata/hydra_configs/html_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/hydra_configs/html_config.yaml -------------------------------------------------------------------------------- /bsmetadata/hydra_configs/test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/hydra_configs/test.yaml -------------------------------------------------------------------------------- /bsmetadata/hydra_configs/v2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/hydra_configs/v2.yaml -------------------------------------------------------------------------------- /bsmetadata/input_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/input_pipeline.py -------------------------------------------------------------------------------- /bsmetadata/metadata_processors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/metadata_processors.py -------------------------------------------------------------------------------- /bsmetadata/metadata_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/metadata_utils.py -------------------------------------------------------------------------------- /bsmetadata/paragraph_by_metadata_html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/paragraph_by_metadata_html.py -------------------------------------------------------------------------------- /bsmetadata/post_processing_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/post_processing_utils.py -------------------------------------------------------------------------------- /bsmetadata/preprocessing_scripts/download_entity_processing_files.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/preprocessing_scripts/download_entity_processing_files.sh -------------------------------------------------------------------------------- /bsmetadata/preprocessing_scripts/download_wiki_dump.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/preprocessing_scripts/download_wiki_dump.sh -------------------------------------------------------------------------------- /bsmetadata/preprocessing_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bsmetadata/preprocessing_tools/html_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/preprocessing_tools/html_parser/__init__.py -------------------------------------------------------------------------------- /bsmetadata/preprocessing_tools/html_parser/filters_and_cleaners.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/preprocessing_tools/html_parser/filters_and_cleaners.py -------------------------------------------------------------------------------- /bsmetadata/preprocessing_tools/html_parser/objects.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/preprocessing_tools/html_parser/objects.py -------------------------------------------------------------------------------- /bsmetadata/preprocessing_tools/html_parser/variables.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/preprocessing_tools/html_parser/variables.py -------------------------------------------------------------------------------- /bsmetadata/preprocessing_tools/wikipedia_desc_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/preprocessing_tools/wikipedia_desc_utils.py -------------------------------------------------------------------------------- /bsmetadata/preprocessing_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/preprocessing_utils.py -------------------------------------------------------------------------------- /bsmetadata/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/bsmetadata/train.py -------------------------------------------------------------------------------- /examples/build_dataset/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/examples/build_dataset/README.md -------------------------------------------------------------------------------- /examples/build_dataset/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/examples/build_dataset/config.yaml -------------------------------------------------------------------------------- /examples/build_dataset/run_build_dataset_with_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/examples/build_dataset/run_build_dataset_with_metadata.py -------------------------------------------------------------------------------- /experiments/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/gcp/dataset/c4/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/gcp/dataset/c4/README.md -------------------------------------------------------------------------------- /experiments/gcp/dataset/c4/tag_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/gcp/dataset/c4/tag_metadata.py -------------------------------------------------------------------------------- /experiments/gcp/dataset/c4/upload.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/gcp/dataset/c4/upload.sh -------------------------------------------------------------------------------- /experiments/hpsearch/test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/hpsearch/test.sh -------------------------------------------------------------------------------- /experiments/jz/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/README.md -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/README.md -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-html-deduped-full/00_import_c4.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-html-deduped-full/00_import_c4.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-html-deduped-full/01_add_metadata.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-html-deduped-full/01_add_metadata.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-html-deduped-full/02_add_website_desc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-html-deduped-full/02_add_website_desc.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-html-deduped-full/03_recreate_arrow_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-html-deduped-full/03_recreate_arrow_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-html-deduped-full/export_to_jsonlines.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-html-deduped-full/export_to_jsonlines.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-html-deduped-full/post_process_website_desc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-html-deduped-full/post_process_website_desc.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-html-deduped-full/push_to_hub.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-html-deduped-full/push_to_hub.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/00_import_c4.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/00_import_c4.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/01_add_metadata_to_toy_c4_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/01_add_metadata_to_toy_c4_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/02_add_website_desc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/02_add_website_desc.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/03_add_entities.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/03_add_entities.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/04_recreate_arrow_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/04_recreate_arrow_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/05_add_entitiies_v2.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/05_add_entitiies_v2.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/06_shard_ds.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/06_shard_ds.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/07_add_entitiies_v3.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/07_add_entitiies_v3.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/08_add_entitiies_batch_2.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/08_add_entitiies_batch_2.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/09_add_entitiies_batch_1_2_retry.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/09_add_entitiies_batch_1_2_retry.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/10_add_entitiies_batch_1_2_retry_2.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/10_add_entitiies_batch_1_2_retry_2.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/11_add_entitiies_batch_2_retry_oom.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/11_add_entitiies_batch_2_retry_oom.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/12_add_entitiies_batch_1_retry_oom.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/12_add_entitiies_batch_1_retry_oom.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/13_add_entitiies_batch_2_retry_oom_2.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/13_add_entitiies_batch_2_retry_oom_2.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/14_add_entitiies_batch_2_retry_oom_3.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/14_add_entitiies_batch_2_retry_oom_3.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/15_add_entitiies_batch_2_retry_oom_4.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/15_add_entitiies_batch_2_retry_oom_4.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/16_add_entitiies_batch_2_retry_oom_5.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/16_add_entitiies_batch_2_retry_oom_5.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/17_reconcatenate.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/17_reconcatenate.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/18_reconcatenate_batch_1.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/18_reconcatenate_batch_1.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/19_export_to_jsonlines.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/19_export_to_jsonlines.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/20_push_to_hub.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/20_push_to_hub.slurm -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/README.md -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/configs/17_config.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/configs/17_config.txt -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/c4-en-v2/configs/18_config.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/c4-en-v2/configs/18_config.txt -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/python_scripts/add_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/python_scripts/add_metadata.py -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/python_scripts/concatenate_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/python_scripts/concatenate_dataset.py -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/python_scripts/create_arrow_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/python_scripts/create_arrow_dataset.py -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/python_scripts/export_to_compressed_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/python_scripts/export_to_compressed_jsonl.py -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/python_scripts/post_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/python_scripts/post_process.py -------------------------------------------------------------------------------- /experiments/jz/dataset/c4/python_scripts/shard_ds.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/dataset/c4/python_scripts/shard_ds.py -------------------------------------------------------------------------------- /experiments/jz/entity/exp_1/toy_example/01_load_tokenizer_and_model.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/entity/exp_1/toy_example/01_load_tokenizer_and_model.slurm -------------------------------------------------------------------------------- /experiments/jz/entity/exp_1/toy_example/02_load_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/entity/exp_1/toy_example/02_load_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/entity/exp_1/toy_example/03_create_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/entity/exp_1/toy_example/03_create_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/entity/exp_1/toy_example/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/entity/exp_1/toy_example/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy1/01_load_tokenizer_and_model.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy1/01_load_tokenizer_and_model.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy1/02_load_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy1/02_load_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy1/03_create_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy1/03_create_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy1/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy1/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy1/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy1/README.md -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy1/multi_steps.bash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy1/multi_steps.bash -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy2-fp16/01_load_tokenizer_and_model.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy2-fp16/01_load_tokenizer_and_model.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy2-fp16/02_load_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy2-fp16/02_load_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy2-fp16/03_create_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy2-fp16/03_create_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy2-fp16/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy2-fp16/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy2-fp16/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy2-fp16/README.md -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy2-fp16/multi_steps.bash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy2-fp16/multi_steps.bash -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/01_load_tokenizer_and_model.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/01_load_tokenizer_and_model.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/02_load_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/02_load_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/03_create_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/03_create_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/README.md -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/local_test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/local_test.sh -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/multi_steps.bash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy3-fp16-multigpu/multi_steps.bash -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy4-test-gpt2-xl/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy4-test-gpt2-xl/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/jz/joint_training_toy/joint_training_toy4-test-gpt2-xl/local_test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/joint_training_toy/joint_training_toy4-test-gpt2-xl/local_test.sh -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_example/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_example/README.md -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_example/subexperiment_1/multi_steps.bash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/multi_steps.bash -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_template/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_template/README.md -------------------------------------------------------------------------------- /experiments/jz/templates/SLURM/experiment_template/multi_steps.bash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/templates/SLURM/experiment_template/multi_steps.bash -------------------------------------------------------------------------------- /experiments/jz/toy_experiments/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/toy_experiments/README.md -------------------------------------------------------------------------------- /experiments/jz/toy_experiments/dumpdb/dumpdb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/toy_experiments/dumpdb/dumpdb.py -------------------------------------------------------------------------------- /experiments/jz/toy_experiments/dumpdb/dumpdb.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/toy_experiments/dumpdb/dumpdb.slurm -------------------------------------------------------------------------------- /experiments/jz/utils/convert_checkpoint_to_hf_format.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/utils/convert_checkpoint_to_hf_format.py -------------------------------------------------------------------------------- /experiments/jz/utils/loading_script_utils/load_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/utils/loading_script_utils/load_dataset.py -------------------------------------------------------------------------------- /experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py -------------------------------------------------------------------------------- /experiments/jz/utils/sync_wandb.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/utils/sync_wandb.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/model_15k/01_load_tokenizer_and_model.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/model_15k/01_load_tokenizer_and_model.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/model_15k/02_load_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/model_15k/02_load_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/model_15k/03_create_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/model_15k/03_create_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/model_15k/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/model_15k/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/model_25k_clean/01_load_tokenizer_and_model.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/model_25k_clean/01_load_tokenizer_and_model.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/model_25k_clean/02_load_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/model_25k_clean/02_load_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/model_25k_clean/03_create_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/model_25k_clean/03_create_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/model_25k_clean/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/model_25k_clean/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/toy_example/01_load_tokenizer_and_model.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/toy_example/01_load_tokenizer_and_model.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/toy_example/02_load_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/toy_example/02_load_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/toy_example/03_create_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/toy_example/03_create_dataset.slurm -------------------------------------------------------------------------------- /experiments/jz/website_metadata/exp_1/toy_example/04_do_training.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/jz/website_metadata/exp_1/toy_example/04_do_training.slurm -------------------------------------------------------------------------------- /experiments/shared_tips/upload_custom_dataset_on_the_hub.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/experiments/shared_tips/upload_custom_dataset_on_the_hub.md -------------------------------------------------------------------------------- /input_examples.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/input_examples.jsonl -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/poetry.lock -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/requirements-dev.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/requirements.txt -------------------------------------------------------------------------------- /requirements_resolved_with_extras_and_dev.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/requirements_resolved_with_extras_and_dev.txt -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/setup.cfg -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/setup.py -------------------------------------------------------------------------------- /tests/data/train_toy_raw_wikitext.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/data/train_toy_raw_wikitext.jsonl -------------------------------------------------------------------------------- /tests/data/train_toy_wikitext_with_metadata.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/data/train_toy_wikitext_with_metadata.jsonl -------------------------------------------------------------------------------- /tests/data/val_toy_raw_wikitext.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/data/val_toy_raw_wikitext.jsonl -------------------------------------------------------------------------------- /tests/data/val_toy_wikitext_with_metadata.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/data/val_toy_wikitext_with_metadata.jsonl -------------------------------------------------------------------------------- /tests/mocks/mock_dump_db.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/mocks/mock_dump_db.py -------------------------------------------------------------------------------- /tests/preprocessing_tools/html_parser/test_html_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/preprocessing_tools/html_parser/test_html_parser.py -------------------------------------------------------------------------------- /tests/test_datasource_preprocessor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_datasource_preprocessor.py -------------------------------------------------------------------------------- /tests/test_entity_preprocessor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_entity_preprocessor.py -------------------------------------------------------------------------------- /tests/test_get_dataloaders.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_get_dataloaders.py -------------------------------------------------------------------------------- /tests/test_get_paragraphs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_get_paragraphs.py -------------------------------------------------------------------------------- /tests/test_length_preprocessor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_length_preprocessor.py -------------------------------------------------------------------------------- /tests/test_metadata_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_metadata_utils.py -------------------------------------------------------------------------------- /tests/test_parse_date.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_parse_date.py -------------------------------------------------------------------------------- /tests/test_preprocessing_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_preprocessing_utils.py -------------------------------------------------------------------------------- /tests/test_train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_train.py -------------------------------------------------------------------------------- /tests/test_website_post_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/metadata/HEAD/tests/test_website_post_processor.py --------------------------------------------------------------------------------