├── .github
└── workflows
│ └── trufflehog.yml
├── .gitignore
├── LICENSE
├── README.md
├── text
├── README.md
├── data
│ ├── decontamination
│ │ └── README.md
│ ├── finemath
│ │ └── README.md
│ ├── fineweb-edu
│ │ └── README.md
│ └── smoltalk
│ │ ├── README.md
│ │ ├── constraints
│ │ ├── README.md
│ │ ├── filter_ifeval_data.py
│ │ ├── launch_ifeval_pipeline.slurm
│ │ └── pipeline
│ │ │ ├── __init__.py
│ │ │ ├── ifeval_tasks.py
│ │ │ ├── json_schemas.py
│ │ │ ├── pipeline.py
│ │ │ └── system_prompts.py
│ │ ├── magpie_ultra_v1
│ │ ├── README.md
│ │ └── pipeline.py
│ │ ├── rewrite
│ │ ├── README.md
│ │ ├── launch_rewrite_pipeline.slurm
│ │ └── pipeline
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ └── pipeline.py
│ │ └── summarization
│ │ ├── README.md
│ │ └── pipelines
│ │ ├── cnn_daily_summaries.py
│ │ └── email_summaries.py
├── evaluation
│ ├── README.md
│ ├── math_utils.py
│ ├── requirements.txt
│ ├── smollm2_base.txt
│ ├── smollm2_instruct.txt
│ └── tasks.py
├── finetuning
│ ├── Dockerfile
│ ├── README.md
│ ├── requirements.txt
│ └── train.py
└── pretraining
│ ├── README.md
│ ├── continual-pretraining
│ ├── README.md
│ └── finemath
│ │ ├── 160B-runs
│ │ ├── fwedu-finemath-infiwebmath-3plus.yaml
│ │ └── fwedu-finemath-infiwebmath-4plus.yaml
│ │ ├── 60B-runs
│ │ ├── finemath-3plus.yaml
│ │ ├── finemath-4plus.yaml
│ │ ├── finemath-infiwebmath-3plus.yaml
│ │ ├── finemath-infiwebmath-4plus.yaml
│ │ ├── infiwebmath-3plus.yaml
│ │ ├── infiwebmath-4plus.yaml
│ │ ├── infiwebmath.yaml
│ │ └── openwebmath.yaml
│ │ ├── finemath-tokenize.py
│ │ ├── tokenization_InfiMM-WebMath-40B.patch
│ │ └── tokenization_finemath.patch
│ ├── launch.slurm
│ ├── smollm1
│ ├── config_smollm1_135M.yaml
│ ├── config_smollm1_1B.yaml
│ └── config_smollm1_360M.yaml
│ └── smollm2
│ ├── config_smollm2_135M.yaml
│ ├── config_smollm2_1B.yaml
│ └── config_smollm2_360M.yaml
├── tools
├── README.md
├── smol_tools
│ ├── README.md
│ ├── demo_tkinter.py
│ ├── requirements.txt
│ └── smol_tools
│ │ ├── agent.py
│ │ ├── base.py
│ │ ├── chatter.py
│ │ ├── rewriter.py
│ │ ├── summarizer.py
│ │ └── titler.py
├── smollm_local_inference
│ ├── README.md
│ ├── llama-cpp-python.py
│ ├── mlc.py
│ ├── mlx.py
│ └── transformers-js.js
└── smolvlm_local_inference
│ ├── README.md
│ └── SmolVLM_video_inference.py
└── vision
├── README.md
├── data
├── README.md
└── datasets_processing_scripts
│ ├── 01_tar_datasets_with_jpeg
│ └── python_scripts
│ │ ├── 01_convert_coco_per_shard_idx.py
│ │ ├── 02_convert_cm4_per_shard_idx.py
│ │ ├── 03_convert_laoin_per_shard_idx.py
│ │ ├── 04_convert_cm4_per_shard_idx.py
│ │ ├── 05_convert_scaled_laion_per_shard_idx.py
│ │ ├── 06_convert_lrv_per_shard_idx.py
│ │ ├── 07_convert_llava_per_shard_idx.py
│ │ ├── 08_convert_svit_per_shard_idx.py
│ │ ├── 09_convert_ultrachat_per_shard_idx.py
│ │ ├── 10_convert_m3it_per_shard_idx.py
│ │ ├── 11_convert_spot_difference_per_shard_idx.py
│ │ ├── 12_convert_llavar_per_shard_idx.py
│ │ ├── 13_convert_vqav2_task_finetuning_per_shard_idx.py
│ │ ├── 14_convert_tikz_per_shard_idx.py
│ │ ├── 15_convert_docvqa_per_shard_idx.py
│ │ ├── 16_convert_image_website_code_per_shard_idx.py
│ │ ├── 17_convert_websight_v02_per_shard_idx.py
│ │ ├── 18_convert_sft_per_shard_idx.py
│ │ └── 19_convert_websight_mix_per_shard_idx.py
│ ├── build_concatenation_datasets_sft
│ ├── build_concat_ds_sft.py
│ ├── build_ds_sft.py
│ ├── build_the_cauldron.py
│ ├── create_set_hashes_test_images.py
│ ├── job_build_the_cauldron.slurm
│ ├── job_merge_on_image_individual_dataset.slurm
│ ├── merge_on_image_individual_dataset.py
│ ├── tar_dataset_pattern_check.py
│ └── viz_tool.py
│ ├── build_ethic_dataset
│ ├── bias_generation_eval_idefics.py
│ └── bias_generation_eval_idefics2.py
│ ├── build_image_website_code
│ ├── 01_generate_ideas_website.py
│ ├── 02_generate_html_css_codes.py
│ ├── 03_extraction_html_css_codes.py
│ └── 04_screenshot_rendered_websites.py
│ ├── build_laion_coco_dataset
│ └── python_scripts
│ │ ├── 02_01_find_opt_out.py
│ │ ├── 02_02_remove_opt_out.py
│ │ ├── 03_remove_nsfw_images.py
│ │ ├── 04_remove_small_images.py
│ │ ├── 05_binary_classification.py
│ │ ├── make_laion_coco.py
│ │ ├── make_laion_coco_1_4_dataset.py
│ │ └── train_bin_classif.py
│ ├── build_laion_dataset
│ └── python_scripts
│ │ ├── 01_01_download_prepare_laion.py
│ │ ├── 01_02_template_loading_script_laion.py
│ │ ├── 02_filter_laion.py
│ │ ├── 03_01_prepare_dedup_laion.py
│ │ ├── 03_02_dedup_laion.py
│ │ ├── 04_01_find_laion_urls.py
│ │ ├── 04_02_create_ds_laion_urls.py
│ │ ├── 04_03_find_opt_out_images_laion.py
│ │ ├── 04_04_remove_opt_out_images_laion.py
│ │ └── merge_1_4_laion_big_shards.py
│ ├── build_webdocs_dataset
│ └── python_scripts
│ │ ├── 01_download_warc.py
│ │ ├── 02_bis_extract_html_get_image_urls_new_rules.py
│ │ ├── 02_extract_html_get_image_urls.py
│ │ ├── 02_parallel_extract_html_get_image_urls.py
│ │ ├── 03_dl_images_create_dataset.py
│ │ ├── 03_parallel_dl_images_create_dataset.py
│ │ ├── 04_merge_web_docs_with_images.py
│ │ ├── 05_filtering_web_docs.py
│ │ ├── 06_01_create_set_image_urls_in_webdocs.py
│ │ ├── 06_02_merge_sets_image_urls_in_webdocs.py
│ │ ├── 06_03_remove_image_duplicates.py
│ │ ├── 07_01_nsfw_image_filtering.py
│ │ ├── 07_02_nsfw_image_visualization.py
│ │ ├── 07_03_nsfw_image_removal.py
│ │ ├── 08_01_prepare_urldedup.py
│ │ ├── 08_02_urldedup.py
│ │ ├── 09_01_create_web_docs_texts_only.py
│ │ ├── 09_02_get_domain_to_positions.py
│ │ ├── 09_03_split_domain_to_positions.py
│ │ ├── 09_04_get_domain_to_duplicated_texts.py
│ │ ├── 09_05_merge_domain_to_duplicated_texts_sharded.py
│ │ ├── 09_06_line_dedup.py
│ │ ├── 09_07_merge_web_docs_texts_only_and_rest.py
│ │ ├── 10_final_cleaning.py
│ │ ├── 11_01_create_set_img_urls.py
│ │ ├── 11_02_get_docs_to_remove_by_set_img_urls_dedup.py
│ │ ├── 11_03_set_img_urls_dedup.py
│ │ ├── 12_01_find_opt_out_images.py
│ │ ├── 12_02_remove_opt_out_images.py
│ │ ├── 13_final_processing.py
│ │ ├── 14_01_filter_perplexity_with_language_model.py
│ │ ├── 15_01_find_urls_obelics.py
│ │ ├── 15_02_find_opt_out.py
│ │ ├── 15_03_remove_opt_out_documents.py
│ │ └── 15_04_remove_opt_out_images.py
│ ├── build_websight_v02
│ └── python_scripts
│ │ ├── 01_generate_ideas_websites.py
│ │ ├── 02_01_generate_html_codes_prompt_1.py
│ │ ├── 02_02_generate_html_codes_prompt_2.py
│ │ ├── 03_filtering_html_codes.py
│ │ └── 04_screenshot_html_codes.py
│ ├── clean_m4_prelimenary_experiments
│ ├── README.md
│ ├── explore
│ │ ├── assets
│ │ │ └── DOM_tree_viz.html
│ │ ├── explore.py
│ │ └── global_visualization.py
│ └── python_scripts
│ │ ├── 01_shard_names.txt
│ │ ├── 02_add_html_back.py
│ │ ├── 03_clean_v2.py
│ │ ├── 04_get_banned_urls.py
│ │ ├── 05_filter_cm4.py
│ │ └── get_modelling_metadata_dataset.py
│ ├── create_evaluation_datasets
│ ├── Flickr30k
│ │ └── flickr30k.py
│ ├── MMBench
│ │ └── make_mmbench.py
│ ├── NLVR2
│ │ └── NLVR2.py
│ ├── README.md
│ ├── SEED
│ │ └── make_seed.py
│ ├── ScienceQA
│ │ ├── scienceqa.py
│ │ └── scienceqa_old_setup.py
│ ├── SugarCrepe
│ │ └── make_sugarcrepe.py
│ ├── create_AI2D
│ │ ├── create_ai2d.py
│ │ ├── create_ai2d_2.py
│ │ ├── create_ai2d_4.py
│ │ ├── create_ai2d_5_abcd.py
│ │ ├── create_ai2d_6_abcd.py
│ │ └── create_ai2d_7_abcd.py
│ ├── create_clevr.py
│ ├── create_coco.py
│ ├── create_fairface.py
│ ├── create_flickr30k.py
│ ├── create_hateful_memes.py
│ ├── create_imagenet1k.py
│ ├── create_imagenet1k_1ksupportset_subsets.py
│ ├── create_imagenet1k_5ksupportset_subsets.py
│ ├── create_math_vista.py
│ ├── create_math_vista_mcq.py
│ ├── create_math_vista_open_ended.py
│ ├── create_mmbench.py
│ ├── create_mmmu.py
│ ├── create_mmmu_mcq.py
│ ├── create_mmmu_open_ended.py
│ ├── create_mmstar.py
│ ├── create_nlvr2.py
│ ├── create_nocaps.py
│ ├── create_okvqa.py
│ ├── create_renderedsst2.py
│ ├── create_scienceqa.py
│ ├── create_scienceqa_old_setup.py
│ ├── create_textcaps.py
│ ├── create_textvqa.py
│ ├── create_visdial.py
│ ├── create_vizwiz.py
│ ├── create_vqav2.py
│ ├── create_vqav2_subsets.py
│ └── dedup_val_mmbench.py
│ ├── create_fine_tuning_datasets
│ ├── create_aokvqa.py
│ ├── create_llavar.py
│ ├── create_m3it.py
│ ├── create_pgm.py
│ ├── create_raven.py
│ ├── create_spot_difference.py
│ └── create_table_datasets.py
│ ├── create_valid_ds
│ ├── create_cm4_valid.py
│ ├── create_coco_valid.py
│ └── create_wiki_valid.py
│ ├── enwiki
│ ├── REAME.md
│ └── python_scripts
│ │ ├── 01_extract_text_and_urls_from_wikipedia_web_documents.py
│ │ ├── 02_load_wit_images_in_ds.py
│ │ ├── 02bis_get_stats.py
│ │ ├── 03_extract_intermediary_dataset.py
│ │ ├── 04_get_list_of_remaining_images.py
│ │ ├── 04bis_get_list_of_remaining_images.py
│ │ ├── 05_download_remaining_urls.py
│ │ ├── 06_create_image_dataset.py
│ │ ├── 07_get_images_in_ds.py
│ │ └── 08_save_dataset.py
│ ├── integrate_evaluation_benchmarks_chatbot
│ ├── gqa.py
│ ├── llava_wild.py
│ ├── mm_vet.py
│ ├── mmbench.py
│ ├── mmbench_no_mcq.py
│ ├── pope.py
│ ├── qbench.py
│ ├── scienceqa.py
│ ├── scienceqa_no_mcq.py
│ ├── seed_img.py
│ └── vsr.py
│ └── upload_rendered_text_dataset
│ └── upload_tar_to_s3.py
├── evaluation
└── README.md
├── experiments
├── evaluation
│ └── vloom
│ │ ├── README.md
│ │ ├── async_eval_template
│ │ ├── run_evals_0_shots.slurm
│ │ ├── run_evals_0_shots_a_la_flamingo.slurm
│ │ ├── run_evals_4_shots.slurm
│ │ └── run_evals_perplexity_validation.slurm
│ │ ├── async_evals_tr_341
│ │ ├── run_evals_4_shots_captioning_1024.slurm
│ │ ├── run_evals_4_shots_captioning_2048.slurm
│ │ ├── run_evals_4_shots_vqa_1024.slurm
│ │ └── run_evals_4_shots_vqa_2048.slurm
│ │ ├── async_evals_tr_343
│ │ ├── run_evals_4_shots_captioning_1024.slurm
│ │ ├── run_evals_4_shots_captioning_2048.slurm
│ │ ├── run_evals_4_shots_vqa_1024.slurm
│ │ └── run_evals_4_shots_vqa_2048.slurm
│ │ ├── async_evals_tr_346
│ │ ├── run_evals_0_shots_test_2048_docvqa.slurm
│ │ ├── run_evals_0_shots_val_1024.slurm
│ │ ├── run_evals_0_shots_val_1536.slurm
│ │ ├── run_evals_0_shots_val_2048.slurm
│ │ └── run_evals_0_shots_val_512.slurm
│ │ ├── async_evals_tr_348
│ │ ├── run_evals_4_shots_captioning_1024.slurm
│ │ ├── run_evals_4_shots_captioning_1920.slurm
│ │ ├── run_evals_4_shots_vqa_1024.slurm
│ │ └── run_evals_4_shots_vqa_1920.slurm
│ │ ├── async_evals_tr_349
│ │ ├── run_evals_0_shots_test_2048_docvqa.slurm
│ │ ├── run_evals_0_shots_val_1024.slurm
│ │ ├── run_evals_0_shots_val_1536.slurm
│ │ ├── run_evals_0_shots_val_2048.slurm
│ │ └── run_evals_0_shots_val_512.slurm
│ │ ├── common
│ │ ├── accelerate_config.yaml
│ │ ├── run_cron_evals_multi_task_cluster.slurm
│ │ ├── run_evals_cluster.slurm
│ │ ├── run_evals_local_datasets.slurm
│ │ ├── run_evals_local_datasets_tickets.slurm
│ │ ├── run_evals_multi_task_cluster.slurm
│ │ ├── run_evals_multi_task_cluster_s3.slurm
│ │ ├── sync_evaluations_on_gcs.slurm
│ │ ├── sync_evaluations_on_s3.slurm
│ │ └── sync_evaluations_on_wandb.slurm
│ │ ├── cron_eval_template
│ │ ├── run_evals_0_shots.slurm
│ │ └── run_evals_2_shots.slurm
│ │ ├── cron_tr_cron_template
│ │ ├── run_evals_0_shots.slurm
│ │ └── run_evals_4_shots.slurm
│ │ └── slurm_scripts_templates
│ │ └── run_evals_master_template.slurm
└── pretraining
│ └── vloom
│ ├── README.md
│ ├── common
│ ├── sync_and_upload.sh
│ └── webdataset_get_file.sh
│ ├── slurm_scripts_templates
│ ├── accelerate_config_multi_node.yaml
│ ├── accelerate_config_single_node.yaml
│ ├── ds_config.json
│ ├── ds_config_bf16.json
│ ├── hfc_with_launcher
│ │ ├── 01_launch.sh
│ │ ├── cleanup-checkpoints.slurm
│ │ ├── config.yaml
│ │ ├── convert-checkpoints.slurm
│ │ ├── s3-upload-checkpoints.slurm
│ │ ├── schedule-evals.slurm
│ │ └── train.slurm
│ ├── multi_node_run.slurm
│ ├── single_node_run.slurm
│ └── with_launcher
│ │ ├── 01_launch.slurm
│ │ ├── accelerate_config.yaml
│ │ ├── config.yaml
│ │ ├── ds_config.json
│ │ └── train.slurm
│ ├── tr_341_smolvlm_025b_1st_stage
│ ├── 01_launch.sh
│ ├── cleanup-checkpoints.slurm
│ ├── config.yaml
│ ├── convert-checkpoints.slurm
│ ├── merge_lora_and_resize_eou.slurm
│ ├── resize_embed_for_eou.py
│ ├── s3-upload-checkpoints.slurm
│ ├── s3-upload-run-files.slurm
│ ├── schedule-evals.sh
│ ├── slurm-status.slurm
│ └── train.slurm
│ ├── tr_343_smolvlm_05b_1st_stage
│ ├── 01_launch.sh
│ ├── cleanup-checkpoints.slurm
│ ├── config.yaml
│ ├── convert-checkpoints.slurm
│ ├── merge_lora_and_resize_eou.slurm
│ ├── resize_embed_for_eou.py
│ ├── s3-upload-checkpoints.slurm
│ ├── s3-upload-run-files.slurm
│ ├── schedule-evals.slurm
│ ├── slurm-status.slurm
│ └── train.slurm
│ ├── tr_345_vsmollm2_256M_2nd_stage
│ ├── 01_launch.sh
│ ├── cleanup-checkpoints.slurm
│ ├── config.yaml
│ ├── convert-checkpoints.slurm
│ ├── merge_lora_and_resize_eou.slurm
│ ├── resize_embed_for_eou.py
│ ├── s3-upload-checkpoints.slurm
│ ├── s3-upload-run-files.slurm
│ ├── schedule-evals.sh
│ ├── slurm-status.slurm
│ └── train.slurm
│ ├── tr_346_vsmollm2_256M_3rd_stage
│ ├── 01_launch.sh
│ ├── cleanup-checkpoints.slurm
│ ├── config.yaml
│ ├── convert-checkpoints.slurm
│ ├── merge_lora_and_resize_eou.slurm
│ ├── resize_embed_for_eou.py
│ ├── s3-upload-checkpoints.slurm
│ ├── s3-upload-run-files.slurm
│ ├── schedule-evals.sh
│ ├── slurm-status.slurm
│ └── train.slurm
│ ├── tr_347_smolvlm_500M_2nd_stage
│ ├── 01_launch.sh
│ ├── cleanup-checkpoints.slurm
│ ├── config.yaml
│ ├── convert-checkpoints.slurm
│ ├── merge_lora_and_resize_eou.slurm
│ ├── resize_embed_for_eou.py
│ ├── s3-upload-checkpoints.slurm
│ ├── s3-upload-run-files.slurm
│ ├── schedule-evals.sh
│ ├── slurm-status.slurm
│ └── train.slurm
│ ├── tr_348_smolvlm_2B
│ ├── 01_launch.sh
│ ├── cleanup-checkpoints.slurm
│ ├── config.yaml
│ ├── convert-checkpoints.slurm
│ ├── merge_lora_and_resize_eou.slurm
│ ├── resize_embed_for_eou.py
│ ├── s3-upload-checkpoints.slurm
│ ├── s3-upload-run-files.slurm
│ ├── schedule-evals.slurm
│ ├── slurm-status.slurm
│ └── train.slurm
│ ├── tr_349_vsmollm2_500M_3rd_stage
│ ├── 01_launch.sh
│ ├── cleanup-checkpoints.slurm
│ ├── config.yaml
│ ├── convert-checkpoints.slurm
│ ├── merge_lora_and_resize_eou.slurm
│ ├── resize_embed_for_eou.py
│ ├── s3-upload-checkpoints.slurm
│ ├── s3-upload-run-files.slurm
│ ├── schedule-evals.sh
│ ├── slurm-status.slurm
│ └── train.slurm
│ ├── tr_350_smolvlm_2B_2nd_stage
│ ├── 01_launch.sh
│ ├── cleanup-checkpoints.slurm
│ ├── config.yaml
│ ├── convert-checkpoints.slurm
│ ├── merge_lora_and_resize_eou.slurm
│ ├── resize_embed_for_eou.py
│ ├── s3-upload-checkpoints.slurm
│ ├── s3-upload-run-files.slurm
│ ├── schedule-evals.sh
│ ├── slurm-status.slurm
│ └── train.slurm
│ └── tr_cron_template
│ ├── 01_launch.sh
│ ├── README.md
│ ├── cleanup-checkpoints.slurm
│ ├── config.yaml
│ ├── convert-checkpoints.slurm
│ ├── s3-upload-checkpoints.slurm
│ ├── s3-upload-run-files.slurm
│ ├── schedule-evals.slurm
│ └── train.slurm
├── finetuning
├── README.md
├── SmolVLM2_Video_FT.ipynb
└── Smol_VLM_FT.ipynb
└── m4
├── __init__.py
├── evaluation
├── README.md
├── __init__.py
├── config.py
├── custom_metrics
│ ├── __init__.py
│ ├── classification_vqa_metrics.py
│ ├── doc_vqa_metrics.py
│ ├── image_caption_matching_metrics.py
│ ├── open_ended_vqa_metrics.py
│ ├── perplexity_metrics.py
│ ├── unfolded_classification_metrics.py
│ ├── unfolded_image_captioning_metrics.py
│ └── utils.py
├── evaluators
│ ├── __init__.py
│ ├── in_contexter.py
│ └── linear_prober.py
├── generation
│ ├── README.md
│ ├── config.py
│ ├── deprecated_generation
│ │ ├── generate.py
│ │ ├── launch_generation.py
│ │ ├── log_generation.py
│ │ ├── log_generation.slurm
│ │ ├── make_generation.slurm
│ │ └── master_generate.slurm
│ └── generate.py
├── launch.py
├── scripts
│ ├── README.md
│ ├── copy_remote_sample_datasets.py
│ ├── create_sample_evaluation_datasets.py
│ ├── create_sample_evaluation_datasets_simplified.py
│ ├── docvqa_to_submission_format.ipynb
│ ├── mmbench_submission_format.py
│ ├── mmmu_submission_format.py
│ ├── sync_evaluations_on_wandb.py
│ └── visualize_generations.py
├── tasks
│ ├── __init__.py
│ └── base.py
├── utils.py
└── vqa_labels.py
├── models
├── __init__.py
├── common.py
├── custom_modules.py
├── idefics
│ ├── configuration_idefics.py
│ ├── evaluation_captioning_in_context_idefics.py
│ ├── evaluation_classification_in_context_idefics.py
│ ├── evaluation_classification_vqa_in_context_idefics.py
│ ├── evaluation_image_caption_matching_idefics.py
│ ├── evaluation_open_ended_vqa_in_context_idefics.py
│ ├── evaluation_perplexity_in_context_idefics.py
│ ├── make_tiny_llama.py
│ ├── make_tiny_model.py
│ └── modeling_idefics.py
├── perceiver
│ └── perceiver.py
├── vgpt2
│ ├── __init__.py
│ ├── configuration_vgpt2.py
│ ├── evaluation_captioning_in_context_vgpt2.py
│ ├── evaluation_classification_in_context_vgpt2.py
│ ├── evaluation_classification_vqa_in_context_vgpt2.py
│ ├── evaluation_image_caption_matching_vgpt2.py
│ ├── evaluation_open_ended_vqa_in_context_vgpt2.py
│ ├── evaluation_perplexity_in_context_vgpt2.py
│ └── modeling_vgpt2.py
├── vllama3
│ ├── __init__.py
│ ├── configuration_vllama3.py
│ ├── evaluation_captioning_in_context_vllama3.py
│ ├── evaluation_classification_in_context_vllama3.py
│ ├── evaluation_open_ended_vqa_in_context_vllama3.py
│ ├── make_tiny_llama3.py
│ ├── make_tiny_model.py
│ └── modeling_vllama3.py
├── vmistral
│ ├── __init__.py
│ ├── configuration_vmistral.py
│ ├── evaluation_captioning_in_context_vmistral.py
│ ├── evaluation_classification_in_context_vmistral.py
│ ├── evaluation_classification_vqa_in_context_vmistral.py
│ ├── evaluation_image_caption_matching_vmistral.py
│ ├── evaluation_open_ended_vqa_in_context_vmistral.py
│ ├── evaluation_perplexity_in_context_vmistral.py
│ ├── make_tiny_mistral.py
│ ├── make_tiny_model.py
│ └── modeling_vmistral.py
└── zero_checkpoint_to_hf.py
├── scripts
├── README.md
├── behead_unused_params.py
├── clean_jsonl_evals.py
├── cleanup-checkpoints.py
├── convert-checkpoints.py
├── convert_vmistral_lm_head.py
├── convert_vmistral_old_siglip_to_new_siglip.py
├── convert_zero_state_dict_for_new_siglip.py
├── job_update_siglip_model_pos_embeds.slurm
├── merge_lora_and_behead.sh
├── merge_lora_and_resize_eou_template.slurm
├── merge_lora_and_save.py
├── merge_lora_template.slurm
├── resize_embed_for_eou.py
├── s3-upload-checkpoints.py
├── s3_checkpoint_download_convert_upload.py
├── s3_checkpoint_download_convert_upload.slurm
├── s3_downloaded_checkpoints_cleanup.slurm
├── schedule-evals.py
├── update_model_embeds.py
├── update_model_perceiver_latents.py
├── update_siglip_model_pos_embeds.py
└── update_vision_model_position_embeds.py
├── sourcing
├── __init__.py
├── data_collection
│ ├── README.md
│ ├── __init__.py
│ ├── callers
│ │ ├── __init__.py
│ │ ├── deduplicate_images_web_documents.py
│ │ ├── download_warc.py
│ │ ├── extract_html.py
│ │ ├── extract_image_text_pairs.py
│ │ ├── extract_web_documents.py
│ │ ├── filter_laion_pairs.py
│ │ ├── filter_web_documents.py
│ │ ├── get_reference_clip_distrib.py
│ │ └── line_deduplicate_web_documents.py
│ ├── configs
│ │ ├── config_extract_web_documents.yaml
│ │ ├── config_filter_laion_pairs.yaml
│ │ ├── config_filter_text_image_pairs.yaml
│ │ └── config_filter_web_documents.yaml
│ ├── debug
│ │ ├── __init__.py
│ │ ├── debug.py
│ │ └── get_intuition.py
│ ├── docs
│ │ ├── filtering_doc.md
│ │ └── image_deduplication_doc.md
│ ├── outputs
│ │ ├── README.md
│ │ ├── clip_scores_laion400m_10000.npy
│ │ ├── clip_scores_red_caps_10000.npy
│ │ ├── clip_scores_sbu_captions_10000.npy
│ │ ├── distributions_extracted.png
│ │ ├── distributions_reference.png
│ │ └── image_text_pairs.jsonl
│ ├── processors
│ │ ├── __init__.py
│ │ ├── dom_tree_simplificator.py
│ │ ├── html_extractor.py
│ │ ├── image_deduplicator.py
│ │ ├── laion_pair_filtering.py
│ │ ├── pair_extractor.py
│ │ ├── pair_filtering.py
│ │ ├── pre_extraction_simplificator.py
│ │ ├── warc_downloader.py
│ │ ├── web_document_extractor.py
│ │ ├── web_document_filtering.py
│ │ ├── web_document_image_deduplication.py
│ │ └── web_document_line_deduplication.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── clip_utils.py
│ │ ├── fetching_utils.py
│ │ ├── filtering_utils.py
│ │ ├── kl_utils.py
│ │ ├── simplification_utils.py
│ │ ├── tags_attributes.py
│ │ └── utils.py
│ └── visualization
│ │ ├── __init__.py
│ │ ├── assets
│ │ └── DOM_tree_viz.html
│ │ ├── choose_filtering_parameters_laion_pairs.py
│ │ ├── choose_filtering_parameters_web_documents_node_level.py
│ │ ├── get_stats_vis_choose_filtering_parameters_laion_pairs.py
│ │ ├── global_visualization.py
│ │ ├── pair_stat_dashboard.py
│ │ ├── pair_visualization.py
│ │ ├── plot_clip_distrib.py
│ │ ├── web_document_and_filtering_visualization.py
│ │ ├── web_document_visualization.py
│ │ └── wikipedia
│ │ ├── explore.py
│ │ ├── explore_wiki_results.py
│ │ └── global_visualization.py
├── get_html_files
│ └── common_crawl.md
├── get_modelling_metadata_dataset
│ ├── get_modelling_metadata_dataset.py
│ ├── get_modelling_metadata_dataset.slurm
│ └── shard_names.txt
├── pmd
│ ├── __init__.py
│ ├── cache_path.py
│ ├── fix_image_path.py
│ ├── helpers.py
│ ├── jz_loaders
│ │ ├── __init__.py
│ │ ├── jz_conceptual_captions
│ │ │ ├── __init__.py
│ │ │ └── jz_conceptual_captions.py
│ │ └── jz_wit
│ │ │ ├── __init__.py
│ │ │ └── jz_wit.py
│ ├── loader_builder.py
│ ├── local_loaders
│ │ ├── __init__.py
│ │ ├── coco
│ │ │ ├── __init__.py
│ │ │ └── coco.py
│ │ ├── laion_2b_en
│ │ │ └── laion_2b_en.py
│ │ ├── localized_narratives__ADE20k
│ │ │ ├── __init__.py
│ │ │ └── localized_narratives__ADE20k.py
│ │ ├── localized_narratives__coco
│ │ │ ├── __init__.py
│ │ │ └── localized_narratives__coco.py
│ │ ├── localized_narratives__flickr30k
│ │ │ ├── __init__.py
│ │ │ └── localized_narratives__flickr30k.py
│ │ ├── localized_narratives__openimages
│ │ │ ├── __init__.py
│ │ │ └── localized_narratives__openimages.py
│ │ └── yfcc100m
│ │ │ ├── __init__.py
│ │ │ └── yfcc100m.py
│ └── scripts
│ │ ├── README.md
│ │ ├── check_none_ims.py
│ │ ├── jz_image_pmd.slurm
│ │ ├── jz_pmd.py
│ │ └── pmd.py
└── processing
│ ├── README.md
│ ├── __init__.py
│ ├── app.py
│ └── extracting_ngrams
│ ├── README.md
│ ├── __init__.py
│ ├── extract_documents_ngrams.py
│ ├── processing_pipeline.slurm
│ ├── run_document_ngrams_extraction.sh
│ └── utils.py
├── testing_utils.py
├── training
├── DATA_DOCUMENTATION.md
├── __init__.py
├── config.py
├── dataset.py
├── dataset_utils.py
├── debug_utils.py
├── main.py
├── packing.py
├── setup_language_model.py
├── setup_vision_model.py
├── trainer.py
├── types.py
└── utils.py
└── utils
├── __init__.py
├── activation_tracker.py
├── check_valid_tokenizer.py
├── datasets
├── __init__.py
├── create_webdataset_tar.py
└── get_self_contained_ds.py
├── debug.py
├── logging.py
├── progress.py
└── training
├── __init__.py
└── timer.py
/.github/workflows/trufflehog.yml:
--------------------------------------------------------------------------------
1 | on:
2 | push:
3 |
4 | name: Secret Leaks
5 |
6 | permissions:
7 | contents: read
8 |
9 | jobs:
10 | trufflehog:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - name: Checkout code
14 | uses: actions/checkout@v4
15 | with:
16 | fetch-depth: 0
17 | - name: Secret Scanning
18 | uses: trufflesecurity/trufflehog@main
--------------------------------------------------------------------------------
/text/data/decontamination/README.md:
--------------------------------------------------------------------------------
1 | # Decontamination
2 |
3 | TODO: add code.
4 | Placeholder here: https://github.com/huggingface/cosmopedia/tree/main/decontamination
--------------------------------------------------------------------------------
/text/data/finemath/README.md:
--------------------------------------------------------------------------------
1 | # 📚 FineWeb-Edu pipeline
2 |
3 |
4 |
5 |
6 |
7 |
8 | Here you can find the pipeline for training [FineWeb-Edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/)'s [classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) and running the annotation on FineWeb.
9 |
10 | ### 1. Finetune a model for educational value regression
11 |
12 | * edit `train_edu_bert.slurm`
13 | ```bash
14 | --base_model_name="Snowflake/snowflake-arctic-embed-m" \ # BERT-like base model
15 | --dataset_name="HuggingFaceFW/fineweb-edu-llama3-annotations" \ # Llama3-annotated educational value dataset
16 | --target_column="score"
17 | ```
18 | * run the training script on a SLURM cluster:
19 | ```bash
20 | sbatch train_edu_bert.slurm
21 | ```
22 |
23 | ### 2. Annotate a dataset with the educational scores predicted by the model
24 |
25 | ```bash
26 | sbatch run_edu_bert.slurm
27 | ```
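
If you just want to score documents with the resulting model: the released [classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) is a regular `transformers` sequence-classification model with a single regression output. A minimal sketch following the classifier's model card (swap in your own checkpoint from `train_edu_bert.slurm` if you retrained it):

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = "HuggingFaceFW/fineweb-edu-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)

text = "Photosynthesis is the process by which plants convert sunlight into chemical energy."
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest")

with torch.no_grad():
    # Single-logit regression head: the logit is the predicted educational score (roughly 0-5)
    score = model(**inputs).logits.squeeze(-1).item()

print(f"educational score: {score:.2f}")
```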
--------------------------------------------------------------------------------
/text/data/smoltalk/README.md:
--------------------------------------------------------------------------------
1 | # SmolTalk: distilabel pipelines
2 | We released [SmolTalk](https://huggingface.co/datasets/HuggingFaceTB/smoltalk), the SFT dataset used to build the SmolLM2 instruct models. It was created with [distilabel](https://github.com/argilla-io/distilabel), and you can find the synthetic data pipelines here.
3 |
4 |
5 |

6 |
*Comparison of models finetuned on SmolTalk and Orca AgentInstruct 1M. For more details, refer to the dataset card.*
7 |
8 |
9 | > [!NOTE]
10 | > This section is still a work in progress. We will upload the rest of the pipelines soon. Thanks for your patience!
11 |
12 |
--------------------------------------------------------------------------------
/text/data/smoltalk/constraints/README.md:
--------------------------------------------------------------------------------
1 | # Smol-Constraints
2 |
3 | This pipeline generates synthetic data similar to the data in the [google/IFEval](https://huggingface.co/datasets/google/IFEval) dataset/benchmark.
4 |
--------------------------------------------------------------------------------
/text/data/smoltalk/constraints/pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/text/data/smoltalk/constraints/pipeline/__init__.py
--------------------------------------------------------------------------------
/text/data/smoltalk/magpie_ultra_v1/README.md:
--------------------------------------------------------------------------------
1 | # MagPie Ultra v1.0
2 |
3 | This [`distilabel`](https://github.com/argilla-io/distilabel) pipeline was used to generate the [magpie-ultra-v1.0](https://huggingface.co/datasets/argilla/magpie-ultra-v1.0) dataset. It follows the [MagPie](https://magpie-align.github.io) recipe to generate a multi-turn conversation dataset using [meta-llama/Llama-3.1-405B-Instruct-FP8](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct-FP8).
4 |
5 | ## Setup
6 |
7 | You will need to install `distilabel` with a few extra dependencies to be able to execute the pipeline:
8 |
9 | ```bash
10 | pip install distilabel[ray,vllm,sentence-transformers,faiss-cpu,hf-transformers]
11 | ```
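
The full multi-step pipeline lives in `pipeline.py` next to this README. As a rough orientation only, the core of a MagPie-style distilabel pipeline looks like the sketch below; the task parameters and values here are illustrative assumptions, not the settings used for magpie-ultra-v1.0:

```python
from distilabel.llms import vLLM
from distilabel.pipeline import Pipeline
from distilabel.steps.tasks import MagpieGenerator

with Pipeline(name="magpie-ultra-sketch") as pipeline:
    # MagpieGenerator sends only the pre-query template to the instruct model,
    # so the model writes the user turns itself and then answers them.
    MagpieGenerator(
        llm=vLLM(
            model="meta-llama/Llama-3.1-405B-Instruct-FP8",
            tokenizer="meta-llama/Llama-3.1-405B-Instruct-FP8",
            magpie_pre_query_template="llama3",
            generation_kwargs={"temperature": 1.0, "max_new_tokens": 1024},
            extra_kwargs={"tensor_parallel_size": 8},
        ),
        n_turns=3,      # multi-turn conversations
        num_rows=1000,  # number of conversations to generate
    )

if __name__ == "__main__":
    distiset = pipeline.run(use_cache=False)
```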
--------------------------------------------------------------------------------
/text/data/smoltalk/rewrite/README.md:
--------------------------------------------------------------------------------
1 | # Smol-Rewrite
2 |
3 | This pipeline is used to generate a synthetic dataset for training a rewriting assistant.
4 |
--------------------------------------------------------------------------------
/text/data/smoltalk/rewrite/pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/text/data/smoltalk/rewrite/pipeline/__init__.py
--------------------------------------------------------------------------------
/text/data/smoltalk/rewrite/pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | from dataset import get_dataset
2 | from distilabel.llms import vLLM
3 | from distilabel.pipeline import Pipeline
4 | from distilabel.steps import StepResources
5 | from distilabel.steps.tasks import TextGeneration
6 |
7 | with Pipeline(name="smol-rewrite").ray() as pipeline:
8 | TextGeneration(
9 | llm=vLLM(
10 | model="Qwen/Qwen2.5-72B-Instruct",
11 | tokenizer="Qwen/Qwen2.5-72B-Instruct",
12 | generation_kwargs={
13 | "temperature": 0.2,
14 | "max_new_tokens": 1024,
15 | "top_p": 0.95,
16 | },
17 | extra_kwargs={
18 | "tensor_parallel_size": 8,
19 | "max_model_len": 4096,
20 | "enable_prefix_caching": True,
21 | },
22 | ),
23 | input_batch_size=1000,
24 | resources=StepResources(replicas=4),
25 | )
26 |
27 |
28 | if __name__ == "__main__":
29 | dataset = get_dataset()
30 | distiset = pipeline.run(dataset=dataset, dataset_batch_size=10000, use_cache=True)
31 | distiset.push_to_hub("HuggingFaceTB/smollm-v2-rewriting")
32 |
--------------------------------------------------------------------------------
/text/data/smoltalk/summarization/README.md:
--------------------------------------------------------------------------------
1 | # Smol-Summarization
--------------------------------------------------------------------------------
/text/evaluation/README.md:
--------------------------------------------------------------------------------
1 | # SmolLM evaluation scripts
2 |
3 | We're using the [LightEval](https://github.com/huggingface/lighteval/) library to benchmark our models.
4 |
5 | Check out the [quick tour](https://github.com/huggingface/lighteval/wiki/Quicktour) to configure it to your own hardware and tasks.
6 |
7 | ## Setup
8 |
9 | Use conda/venv with `python>=3.10`.
10 |
11 | Adjust the PyTorch installation according to your environment:
12 | ```bash
13 | pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121
14 | ```
15 | For reproducibility, we recommend installing fixed versions of the libraries:
16 | ```bash
17 | pip install -r requirements.txt
18 | ```
19 |
20 | ## Running the evaluations
21 |
22 | ### SmolLM2 base models
23 |
24 | ```bash
25 | lighteval accelerate \
26 | --model_args "pretrained=HuggingFaceTB/SmolLM2-1.7B,revision=main,dtype=bfloat16,vllm,gpu_memory_utilisation=0.8,max_model_length=2048" \
27 | --custom_tasks "tasks.py" --tasks "smollm2_base.txt" --output_dir "./evals" --save_details
28 | ```
29 |
30 | ### SmolLM2 instruction-tuned models
31 |
32 | (note the `--use_chat_template` flag)
33 | ```bash
34 | lighteval accelerate \
35 | --model_args "pretrained=HuggingFaceTB/SmolLM2-1.7B-Instruct,revision=main,dtype=bfloat16,vllm,gpu_memory_utilisation=0.8,max_model_length=2048" \
36 | --custom_tasks "tasks.py" --tasks "smollm2_instruct.txt" --use_chat_template --output_dir "./evals" --save_details
37 | ```
38 |
39 | ### FineMath dataset ablations
40 |
41 | See the collection for model names: https://huggingface.co/collections/HuggingFaceTB/finemath-6763fb8f71b6439b653482c2
42 |
43 | ```bash
44 | lighteval accelerate \
45 | --model_args "pretrained=HuggingFaceTB/finemath-ablation-4plus-160B,revision=main,dtype=bfloat16,vllm,gpu_memory_utilisation=0.7,max_model_length=4096" \
46 | --custom_tasks "tasks.py" --tasks "custom|math|4|1,custom|gsm8k|5|1,custom|arc:challenge|0|1,custom|mmlu_pro|0|1,custom|hellaswag|0|1" --output_dir "./evals" --save_details
47 | ```
48 |
--------------------------------------------------------------------------------
/text/evaluation/requirements.txt:
--------------------------------------------------------------------------------
1 | lighteval[accelerate,extended_tasks,vllm] @ git+https://github.com/huggingface/lighteval@ea46419a93fb390e8f694f7c6c64c1e684487c9d
2 | fsspec>=2024.3.0
3 | word2number
--------------------------------------------------------------------------------
/text/evaluation/smollm2_base.txt:
--------------------------------------------------------------------------------
1 | custom|hellaswag|0|1
2 | custom|arc|0|1
3 | custom|piqa|0|1
4 | custom|mmlu_pro|0|1
5 | custom|commonsense_qa|0|1
6 | custom|trivia_qa|0|1
7 | custom|winogrande|0|1
8 | custom|openbook_qa|0|1
9 | custom|gsm8k|5|1
--------------------------------------------------------------------------------
/text/evaluation/smollm2_instruct.txt:
--------------------------------------------------------------------------------
1 | extended|ifeval|0|0
2 | custom|hellaswag|0|1
3 | custom|arc|0|1
4 | custom|piqa|0|1
5 | custom|mmlu_pro|0|1
6 | custom|bbh|3|1
7 | custom|gsm8k|5|1
--------------------------------------------------------------------------------
/text/finetuning/Dockerfile:
--------------------------------------------------------------------------------
1 | # base image: CUDA 12.1
2 | FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
3 |
4 | WORKDIR /app
5 |
6 | # install necessary packages
7 | RUN apt-get update && apt-get install -y \
8 | git \
9 | wget \
10 | curl \
11 | ca-certificates \
12 | libglib2.0-0 \
13 | libsm6 \
14 | libxrender1 \
15 | libxext6 \
16 | libssl-dev \
17 | libffi-dev \
18 | python3 \
19 | python3-pip \
20 | && rm -rf /var/lib/apt/lists/*
21 |
22 | # set python3 as default python
23 | RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
24 |
25 | RUN pip install --upgrade pip setuptools
26 |
27 | RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu121
28 |
29 | COPY requirements.txt .
30 | RUN pip install -r requirements.txt
31 |
32 | COPY . .
33 |
34 | ENV PYTHONUNBUFFERED=1
35 |
36 | CMD ["bash"]
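
A possible way to use this image (the image tag and mount path below are illustrative, not defined anywhere in this repo):

```bash
# Build the image from the finetuning/ directory
docker build -t smollm-finetune .

# Open an interactive shell with GPU access, mounting the working tree over /app
docker run --gpus all -it -v "$(pwd)":/app smollm-finetune
# ...then run train.py inside the container (see the finetuning README for the expected arguments)
```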
--------------------------------------------------------------------------------
/text/finetuning/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | trl>=0.15
3 | peft
4 | accelerate
5 | datasets
6 | wandb
7 | bitsandbytes
8 |
--------------------------------------------------------------------------------
/text/pretraining/README.md:
--------------------------------------------------------------------------------
1 | # Pretraining
2 | We use the [nanotron](https://github.com/huggingface/nanotron/) library to train the SmolLM and SmolLM2 base models.
3 |
4 | The scripts for training SmolLM v1 are in the `smollm1` folder and those for training SmolLM2 are in the `smollm2` folder; we will add the details of the data mixtures soon. SmolLM2 uses a similar architecture to SmolLM but with an improved data mixture and significantly longer training (11 trillion tokens for the 1.7B, 4 trillion for the 360M, and 2 trillion for the 135M).
5 |
6 | ## Setup
7 |
8 | Please refer to [nanotron](https://github.com/huggingface/nanotron/) for detailed instructions on setting up your training environment and launching jobs.
9 |
10 | After setting up the environment and tokenizing the training datasets with [datatrove](https://github.com/huggingface/datatrove) (instructions available [here](https://github.com/huggingface/nanotron/blob/main/docs/nanoset.md#nanosets)), you can modify the configurations to match your number of nodes and local paths.
11 |
12 | Below is an example of launching SmolLM1 135M training on 1 node; set the DP value to 8 in the config and adjust the batch size accordingly, then run:
13 |
14 | ```bash
15 | git clone https://github.com/huggingface/nanotron
16 | cd nanotron
17 | # follow installation
18 | CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file smollm1/config_smollm1_135M.yaml
19 | ```
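
For reference, the data-parallel size and batch size mentioned above are controlled by the `parallelism` and `tokens` sections of the nanotron config; a trimmed sketch with illustrative values (check `config_smollm1_135M.yaml` for the real ones):

```yaml
parallelism:
  dp: 8   # one data-parallel rank per GPU on a single 8-GPU node
  tp: 1
  pp: 1
tokens:
  micro_batch_size: 8               # adjust together with dp to keep the global batch size constant
  batch_accumulation_per_replica: 4
  sequence_length: 2048
```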
20 |
21 | If you are working on a SLURM cluster, you can modify `launch.slurm` and launch the training with:
22 |
23 | ```bash
24 | sbatch launch.slurm
25 | ```
26 | > [!NOTE]
27 | > Don't forget to create the logs directory (e.g. `mkdir -p logs`) before launching the job.
28 |
29 | ## Continual pre-training
30 |
31 | The nanotron checkpoints for SmolLM2 models are available at: https://huggingface.co/HuggingFaceTB/SmolLM2-nanotron-ckpt
32 |
33 | You can find an example of continual pre-training in the [continual-pretraining](./continual-pretraining) folder.
34 |
--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
1 | # Tools for local inference
2 |
3 | Here you can find tools and demos for running SmolLM2 and SmolVLM locally, leveraging libraries such as llama.cpp, MLX, MLC, and Transformers.js.
--------------------------------------------------------------------------------
/tools/smol_tools/requirements.txt:
--------------------------------------------------------------------------------
1 | tkmacosx>=1.0.5
2 | pynput>=1.7.7
3 | llama-cpp-python>=0.3.1
4 | pyperclip>=1.9.0
5 | transformers>=4.46.2
6 | pygments>=2.18.0
--------------------------------------------------------------------------------
/tools/smol_tools/smol_tools/rewriter.py:
--------------------------------------------------------------------------------
1 | from .base import SmolTool
2 | from typing import Generator
3 |
4 | class SmolRewriter(SmolTool):
5 | def __init__(self):
6 | super().__init__(
7 | model_repo="andito/SmolLM2-1.7B-Instruct-F16-GGUF",
8 | model_filename="smollm2-1.7b-8k-dpo-f16.gguf",
9 | system_prompt="You are an AI writing assistant. Your task is to rewrite the user's email to make it more professional and approachable while maintaining its main points and key message. Do not return any text other than the rewritten message.",
10 | prefix_text="Rewrite the message below to make it more professional and approachable while maintaining its main points and key message. Do not add any new information or return any text other than the rewritten message\nThe message:"
11 | )
12 |
13 | def process(self, text: str) -> Generator[str, None, None]:
14 | messages = [
15 | {"role": "system", "content": self.system_prompt},
16 | {"role": "user", "content": f"{self.prefix_text}\n{text}"}
17 | ]
18 | yield from self._create_chat_completion(messages, temperature=0.4, repeat_penalty=1.0, top_k=0, max_tokens=1024)
--------------------------------------------------------------------------------
/tools/smol_tools/smol_tools/summarizer.py:
--------------------------------------------------------------------------------
1 | from .base import SmolTool
2 | from typing import Generator, Optional
3 | from dataclasses import dataclass
4 | from datetime import datetime
5 | from typing import List
6 |
7 | @dataclass
8 | class SummaryMessage:
9 | role: str # "user" or "assistant"
10 | content: str
11 | timestamp: datetime
12 |
13 | class SmolSummarizer(SmolTool):
14 | def __init__(self):
15 | self.name = "SmolLM2-1.7B"
16 |
17 | super().__init__(
18 | model_repo="andito/SmolLM2-1.7B-Instruct-F16-GGUF",
19 | model_filename="smollm2-1.7b-8k-dpo-f16.gguf",
20 | system_prompt="Concisely summarize the main points of the input text in up to three sentences, focusing on key information and events.",
21 | )
22 |
23 | def process(self, text: str, question: Optional[str] = None) -> Generator[str, None, None]:
24 | if question is None:
25 | print("Summarizing text")
26 | prompt = f"{self.prefix_text}\n{text}"
27 | messages = [
28 | {"role": "system", "content": self.system_prompt},
29 | {"role": "user", "content": prompt},
30 | {"role": "assistant", "content": "This is a short summary of the text:"}
31 | ]
32 | else:
33 | print("Answering question")
34 | prompt = f"Original text:\n{text}\n\nQuestion: {question}"
35 | messages = [
36 | {"role": "user", "content": prompt},
37 | ]
38 |
39 | for chunk in self._create_chat_completion(messages, max_tokens=1024, temperature=0.1, top_p=0.9):
40 | yield chunk
41 |
--------------------------------------------------------------------------------
/tools/smol_tools/smol_tools/titler.py:
--------------------------------------------------------------------------------
1 | from .base import SmolTool
2 | from typing import Generator
3 |
4 | class SmolTitler(SmolTool):
5 | def __init__(self):
6 | super().__init__(
7 | model_repo="andito/SmolLM2-1.7B-Instruct-F16-GGUF",
8 | model_filename="smollm2-1.7b-8k-dpo-f16.gguf",
9 | system_prompt="",
10 | prefix_text="Create a title for this conversation:",
11 | )
12 |
13 | def process(self, text: str) -> Generator[str, None, None]:
14 | messages = [
15 | {"role": "user", "content": f"{self.prefix_text}\n{text}"}
16 | ]
17 | yield from self._create_chat_completion(messages, max_tokens=128, temperature=0.6, top_p=0.9, top_k=0, repeat_penalty=1.1)
--------------------------------------------------------------------------------
/tools/smollm_local_inference/llama-cpp-python.py:
--------------------------------------------------------------------------------
1 | from llama_cpp import Llama
2 |
3 | llm = Llama.from_pretrained(
4 | repo_id="HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF",
5 | filename="*q4_k_m.gguf",
6 | verbose=False
7 | )
8 |
9 | output = llm(
10 | "Q: Name the planets in the solar system? A: ", # Prompt
11 | max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
12 | stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
13 | echo=True # Echo the prompt back in the output
14 | ) # Generate a completion, can also call create_completion
15 |
16 | print(output)
--------------------------------------------------------------------------------
/tools/smollm_local_inference/mlc.py:
--------------------------------------------------------------------------------
1 | from mlc_llm import MLCEngine
2 |
3 | # Create engine
4 | model = "HF://mlc-ai/SmolLM2-1.7B-Instruct-q0f16-MLC"
5 | engine = MLCEngine(model)
6 |
7 | # Run a chat completion through the OpenAI-compatible API.
8 | for response in engine.chat.completions.create(
9 | messages=[{"role": "user", "content": "What is the meaning of life?"}],
10 | model=model,
11 | stream=True,
12 | ):
13 | for choice in response.choices:
14 | print(choice.delta.content, end="", flush=True)
15 | print("\n")
16 |
17 | engine.terminate()
18 |
--------------------------------------------------------------------------------
/tools/smollm_local_inference/mlx.py:
--------------------------------------------------------------------------------
1 | from mlx_lm import load, generate
2 |
3 | model, tokenizer = load("HuggingFaceTB/SmolLM2-1.7B-Instruct-Q8-mlx")
4 |
5 | prompt = "Hello"
6 |
7 | messages = [{"role": "user", "content": prompt}]
8 | prompt = tokenizer.apply_chat_template(
9 | messages, tokenize=False, add_generation_prompt=True
10 | )
11 |
12 | response = generate(model, tokenizer, prompt=prompt, verbose=True)
13 | print(response)
14 |
--------------------------------------------------------------------------------
/tools/smollm_local_inference/transformers-js.js:
--------------------------------------------------------------------------------
1 | import { pipeline } from "@huggingface/transformers";
2 |
3 | // Create a text generation pipeline
4 | const generator = await pipeline(
5 | "text-generation",
6 | "HuggingFaceTB/SmolLM2-1.7B-Instruct",
7 | { dtype: "q4f16" },
8 | );
9 |
10 | // Define the list of messages
11 | const messages = [
12 | { role: "system", content: "You are a helpful assistant." },
13 | { role: "user", content: "Rewrite the following: hello how r u?" },
14 | ];
15 |
16 | // Generate a response
17 | const output = await generator(messages, { max_new_tokens: 128 });
18 | console.log(output[0].generated_text.at(-1).content);
19 | // "Hello, how's it going?"
--------------------------------------------------------------------------------
/vision/data/README.md:
--------------------------------------------------------------------------------
1 | # Data
2 |
3 | The scripts inside `datasets_processing_scripts` are the ones we used to create all the datasets used for training SmolVLM.
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/01_convert_coco_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--shard_dir_path", type=Path, required=True)
25 | parser.add_argument("--saving_dir", type=Path, required=True)
26 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
27 | parser.add_argument("--num_proc", type=int, required=True)
28 | parser.add_argument("--shard_idx", type=int, required=True)
29 | parser.add_argument("--min_num_shards", type=int)
30 | args = parser.parse_args()
31 | return args
32 |
33 |
34 | def main(args):
35 | shard_1_dirs = [shard_dir for shard_dir in args.shard_dir_path.iterdir()]
36 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS
37 |
38 | export_dataset_shard_idx_to_tar(
39 | hf_datasets_paths=shard_1_dirs,
40 | saving_dir=args.saving_dir,
41 | ds_type=ds_type,
42 | num_examples_per_shard=args.num_examples_per_shard,
43 | num_proc=args.num_proc,
44 | shard_idx=args.shard_idx,
45 | min_num_shards=args.min_num_shards,
46 | )
47 |
48 |
49 | if __name__ == "__main__":
50 | args = get_args()
51 | main(args)
52 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/02_convert_cm4_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--shard_dir_path", type=Path, required=True)
25 | parser.add_argument("--saving_dir", type=Path, required=True)
26 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
27 | parser.add_argument("--num_proc", type=int, required=True)
28 | parser.add_argument("--shard_idx", type=int, required=True)
29 | parser.add_argument("--min_num_shards", type=int)
30 | args = parser.parse_args()
31 | return args
32 |
33 |
34 | def main(args):
35 | shard_1_dirs = [shard_dir for shard_dir in args.shard_dir_path.iterdir()]
36 | ds_type = DatasetTypes.WEB_DOCUMENTS
37 |
38 | export_dataset_shard_idx_to_tar(
39 | hf_datasets_paths=shard_1_dirs,
40 | saving_dir=args.saving_dir,
41 | ds_type=ds_type,
42 | num_examples_per_shard=args.num_examples_per_shard,
43 | num_proc=args.num_proc,
44 | shard_idx=args.shard_idx,
45 | min_num_shards=args.min_num_shards,
46 | )
47 |
48 |
49 | if __name__ == "__main__":
50 | args = get_args()
51 | main(args)
52 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/06_convert_lrv_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.LRV_PAIRS
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["VictorSanh/LrvInstruction:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/07_convert_llava_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.LLaVA
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["HuggingFaceM4/LLaVA-Instruct-150K:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/08_convert_svit_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.LRV_PAIRS
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["HuggingFaceM4/SVITMerged:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/09_convert_ultrachat_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.TEXT_DIALOGUE
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["stingning/ultrachat:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/10_convert_m3it_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.M3IT_PAIRS
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["HuggingFaceM4/M3IT:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/11_convert_spot_difference_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.SPOT_DIFFERENCE_PAIRS
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["HuggingFaceM4/SpotDifference_4:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/12_convert_llavar_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.LLaVA
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["HuggingFaceM4/LLaVAR-Instruct-16K:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/13_convert_vqav2_task_finetuning_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.VQAV2_TASK_FINETUNING
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["HuggingFaceM4/vqav2_task_finetuning:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/14_convert_tikz_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["HuggingFaceM4/datikz_modif:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/15_convert_docvqa_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.DOCVQA
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["HuggingFaceM4/DocumentVQA:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/16_convert_image_website_code_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["HuggingFaceM4/image_to_website_code:train"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/17_convert_websight_v02_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--saving_dir", type=Path, required=True)
25 | parser.add_argument("--num_examples_per_shard", type=int, required=True)
26 | parser.add_argument("--num_proc", type=int, required=True)
27 | parser.add_argument("--shard_idx", type=int, required=True)
28 | parser.add_argument("--min_num_shards", type=int)
29 | args = parser.parse_args()
30 | return args
31 |
32 |
33 | def main(args):
34 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS
35 |
36 | export_dataset_shard_idx_to_tar(
37 | hf_datasets_paths=["/fsx/hugo/ds_websight_v02"],
38 | saving_dir=args.saving_dir,
39 | ds_type=ds_type,
40 | num_examples_per_shard=args.num_examples_per_shard,
41 | num_proc=args.num_proc,
42 | shard_idx=args.shard_idx,
43 | min_num_shards=args.min_num_shards,
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 | args = get_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/19_convert_websight_mix_per_shard_idx.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from datasets import set_caching_enabled
6 |
7 | from m4.training.types import DatasetTypes
8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_all_shard_idx_to_tar
9 |
10 |
11 | logging.basicConfig(
12 | level=logging.INFO,
13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
14 | datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 | set_caching_enabled(False)
20 |
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--ds_path", type=Path, required=True)
25 | parser.add_argument("--saving_dir", type=Path, required=True)
26 | parser.add_argument("--num_examples_per_shard", type=int)
27 | parser.add_argument("--s3_uri", type=str)
28 | parser.add_argument("--num_proc", type=int, required=True)
29 | parser.add_argument("--min_num_shards", type=int)
30 | parser.add_argument("--save_shard_prefix", type=str, default="")
31 |
32 | args = parser.parse_args()
33 | return args
34 |
35 |
36 | def main(args):
37 | ds_paths = [args.ds_path]
38 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS
39 |
40 | export_dataset_all_shard_idx_to_tar(
41 | hf_datasets_paths=ds_paths,
42 | saving_dir=args.saving_dir,
43 | ds_type=ds_type,
44 | num_examples_per_shard=args.num_examples_per_shard,
45 | s3_uri=args.s3_uri,
46 | num_proc=args.num_proc,
47 | min_num_shards=args.min_num_shards,
48 | save_shard_prefix=args.save_shard_prefix,
49 | )
50 |
51 |
52 | if __name__ == "__main__":
53 | args = get_args()
54 | main(args)
55 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/build_concatenation_datasets_sft/create_set_hashes_test_images.py:
--------------------------------------------------------------------------------
1 | """
2 | srun --pty --cpus-per-task=96 --mem-per-cpu=20G --partition=hopper-prod bash -i
3 | conda activate shared-m4
4 | """
5 |
6 |
7 | import hashlib
8 | import json
9 |
10 | from datasets import concatenate_datasets, load_dataset
11 | from tqdm import tqdm
12 |
13 |
14 | NAME_DS_TO_HASH = [
15 | "HuggingFaceM4/MMBench_modif_chatbot",
16 | "HuggingFaceM4/MathVista-modif",
17 | "HuggingFaceM4/MMMU-modif",
18 | ]
19 |
20 | PATH_SAVE_LIST_HASHES = "/fsx/hugo/fine_tuning_datasets_merge_image_individual/list_hashes_test_images.json"
21 |
22 |
23 | list_hashes = []
24 |
25 | for name_ds in tqdm(NAME_DS_TO_HASH):
26 |     potential_split_names = ["testmini", "test", "validation", "dev"]
27 |     all_splits = []
28 |     for split in potential_split_names:
29 |         try:
30 |             all_splits.append(load_dataset(name_ds, split=split))
31 |         except Exception:  # the split does not exist for this dataset, skip it
32 |             pass
33 | ds = concatenate_datasets(all_splits)
34 | if "image" in ds.column_names:
35 | images = ds["image"]
36 | elif "images" in ds.column_names:
37 | images = ds["images"]
38 | images = [img for list_images in images for img in list_images]
39 | else:
40 | raise ValueError("images not found in the dataset")
41 | for img in tqdm(images):
42 | md5hash = hashlib.md5(img.tobytes()).hexdigest()
43 | list_hashes.append(md5hash)
44 |
45 |
46 | with open(PATH_SAVE_LIST_HASHES, "w") as f:
47 | json.dump(list_hashes, f)
48 |
--------------------------------------------------------------------------------
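The hash list saved by `create_set_hashes_test_images.py` above is meant to be matched against training images so that examples overlapping with these evaluation sets can be dropped. Below is a minimal sketch of that downstream check; it is illustrative only and not repository code, and the training-dataset path and its decoded `images` column are hypothetical:

```python
# Illustrative sketch (not repository code): drop training examples whose images
# collide with the evaluation-image hashes saved by create_set_hashes_test_images.py.
import hashlib
import json

from datasets import load_from_disk

PATH_LIST_HASHES = "/fsx/hugo/fine_tuning_datasets_merge_image_individual/list_hashes_test_images.json"
PATH_TRAIN_DS = "/path/to/a_fine_tuning_dataset"  # hypothetical dataset with an "images" column of PIL images

with open(PATH_LIST_HASHES) as f:
    test_image_hashes = set(json.load(f))


def has_no_test_image(example):
    # Keep the example only if none of its images hash to a known evaluation image
    return all(hashlib.md5(img.tobytes()).hexdigest() not in test_image_hashes for img in example["images"])


ds_train = load_from_disk(PATH_TRAIN_DS)
ds_train = ds_train.filter(has_no_test_image, num_proc=96)
```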
/vision/data/datasets_processing_scripts/build_concatenation_datasets_sft/job_build_the_cauldron.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=build_the_cauldron
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=88
6 | #SBATCH --mem-per-cpu=21G
7 | #SBATCH --output=/fsx/m4/experiments/general_logs/build_the_cauldron/res%A_%a
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --qos high
10 |
11 | source /fsx/m4/start-m4-user
12 | conda activate shared-m4
13 |
14 |
15 | python /fsx/hugo/repos/m4_28/datasets_processing_scripts/build_concatenation_datasets_sft/build_the_cauldron.py ${SLURM_ARRAY_TASK_ID}
16 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/build_concatenation_datasets_sft/job_merge_on_image_individual_dataset.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=merge_on_image_individual_dataset
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=88
6 | #SBATCH --mem-per-cpu=21G
7 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_on_image_individual_dataset/res%A_%a
8 | #SBATCH --partition=hopper-prod
9 |
10 | source /fsx/m4/start-m4-user
11 | conda activate /fsx/m4/conda/hugo_3
12 |
13 |
14 | python /fsx/hugo/repos/m4_15/datasets_processing_scripts/build_concatenation_datasets_sft/merge_on_image_individual_dataset.py ${SLURM_ARRAY_TASK_ID}
15 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/build_concatenation_datasets_sft/tar_dataset_pattern_check.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import datasets
4 |
5 | from m4.training.dataset_utils import get_webdataset
6 | from m4.training.types import DatasetTypes
7 |
8 |
9 | base_path = "/fsx/leo/fine_tuning_datasets/concat_chatty_tar/shard_{index}.tar"
10 |
11 | # Generate paths using a list comprehension
12 | webdataset_paths = [base_path.format(index=i) for i in range(0, 1785)]
13 |
14 |
15 | FEATURES = datasets.Features(
16 | {
17 | "__key__": datasets.Value("string"),
18 | "__url__": datasets.Value("string"),
19 | "images": datasets.Sequence(datasets.Image(decode=True)),
20 | "texts": [
21 | {
22 | "user": datasets.Value("string"),
23 | "assistant": datasets.Value("string"),
24 | "source": datasets.Value("string"),
25 | }
26 | ],
27 | }
28 | )
29 | combined_dataset = get_webdataset(
30 | urls=webdataset_paths,
31 | ds_type=DatasetTypes.SFT,
32 | batch_size=10,
33 | shuffle_initial_urls_list=False,
34 |     shuffle_before_split_by_node_buffer_size=None,
35 |     shuffle_before_split_by_worker_buffer_size=None,
36 |     shuffle_after_tarfile_to_samples_buffer_size=None,
37 | shuffle_after_batching_buffer_size=None,
38 | )
39 |
40 | # Regex pattern
41 | pattern = r"[?!:]\."
42 | # Find all occurrences
43 | all_matches = []
44 | # Process each text
45 | for batch in combined_dataset:
46 | for turns in batch["texts"]:
47 | for turn in turns:
48 | text = turn["assistant"]
49 | matches = re.findall(pattern, text)
50 | if matches:
51 | all_matches.extend(matches)
52 |
53 | print(f"len matches: {len(all_matches)}")
54 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/build_laion_dataset/python_scripts/03_01_prepare_dedup_laion.py:
--------------------------------------------------------------------------------
1 | # The following steps were done in different jobs; this is to give an idea of what was done
2 |
3 |
4 | import pickle
5 |
6 | import numpy as np
7 | from datasets import load_dataset
8 | from tqdm import tqdm
9 |
10 |
11 | laion_dataset = load_dataset("laion/laion2b-en-vit-h-14-embeddings")["train"] # Takes a long time to download
12 | # The md5 is shorter than the url for identifying an image. Moreover, some images in the dataset are identical but
13 | # stored under different urls: in that case they share the same md5, which gives us even more compact data
14 | # laion_dataset_md5 is uploaded at s3://m4-datasets/trash/laion_dataset_md5/
15 | laion_dataset_md5 = laion_dataset.remove_columns([c_n for c_n in laion_dataset.column_names if c_n != "md5"])
16 |
17 | # Download at https://huggingface.co/datasets/fraisdufour/snip-dedup/resolve/main/is_dup_mlp_1024_128_gelu_snn_2layer_notext.npy
18 | is_dup_all = np.load("/fsx/hugo/prepare_dedup_laion/is_dup_mlp_1024_128_gelu_snn_2layer_notext.npy").ravel()
19 |
20 | list_index_dup = [idx for idx, el in enumerate(is_dup_all) if el] + [
21 |     idx for idx in range(len(is_dup_all), len(laion_dataset_md5))  # rows not covered by the dedup mask are also treated as duplicates
22 | ]
23 | set_dup = set()
24 | for idx in tqdm(list_index_dup):
25 | set_dup.add(laion_dataset_md5[idx]["md5"])
26 |
27 | # set_dup_md5.pkl is uploaded at s3://m4-datasets/trash/set_dup_md5.pkl
28 | with open("/fsx/hugo/prepare_dedup_laion/set_dup_md5.pkl", "wb") as f:
29 | pickle.dump(set_dup, f)
30 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/build_laion_dataset/python_scripts/04_02_create_ds_laion_urls.py:
--------------------------------------------------------------------------------
1 | """
2 | srun --pty --cpus-per-task=48 --mem-per-cpu=11G bash -i
3 | conda activate /fsx/m4/conda/shared-m4-2023-03-10
4 | """
5 |
6 |
7 | import json
8 | import os
9 |
10 | from datasets import Dataset
11 | from tqdm import tqdm
12 |
13 |
14 | NUM_SHARDS = 200
15 |
16 | PATH_LAION_URLS_S3 = "s3://m4-datasets/LAION_data/urls_laion_dataset_filtered_dedup/"
17 | PATH_LAION_URLS_LOCAL = "/scratch/laion_urls"
18 |
19 | PATH_SAVE_DISK_DS_LAION_URLS = "/scratch/ds_laion_urls"
20 | PATH_SAVE_S3_DS_LAION_URLS = "s3://m4-datasets/LAION_data/ds_urls_laion_dataset_filtered_dedup/"
21 |
22 | NUM_PROC = 48
23 |
24 |
25 | if __name__ == "__main__":
26 | command_sync_s3 = f"aws s3 sync {PATH_LAION_URLS_S3} {PATH_LAION_URLS_LOCAL}"
27 | os.system(command_sync_s3)
28 |
29 | all_urls = []
30 | for idx_shard in tqdm(range(NUM_SHARDS)):
31 | if idx_shard not in [184, 189]:
32 | path_urls_laion_shard = os.path.join(PATH_LAION_URLS_LOCAL, str(idx_shard), "laion_urls.json")
33 | with open(path_urls_laion_shard) as f:
34 | all_urls.extend(json.load(f))
35 |
36 | ds_laion_urls = Dataset.from_dict({"url": all_urls})
37 | ds_laion_urls.save_to_disk(PATH_SAVE_DISK_DS_LAION_URLS, num_proc=NUM_PROC)
38 |
39 | command_sync_s3 = f"aws s3 sync {PATH_SAVE_DISK_DS_LAION_URLS} {PATH_SAVE_S3_DS_LAION_URLS}"
40 | os.system(command_sync_s3)
41 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts/03_parallel_dl_images_create_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import signal
3 | import subprocess
4 | import sys
5 |
6 | import numpy as np
7 |
8 |
9 | idx_machine = int(sys.argv[1])
10 |
11 | IDX_REMAINING = [idx for idx in range(200)]
12 | NUM_MACHINES = 21
13 | IDX = [el.tolist() for el in np.array_split(IDX_REMAINING, NUM_MACHINES)][idx_machine]
14 | PATH_LOG = "/scratch/log.txt"
15 |
16 |
17 | for idx in IDX:
18 | f = open(PATH_LOG, "a")
19 | f.write(f"Starting job {idx}\n")
20 | f.close()
21 |
22 | os.system("sudo truncate -s 0 /var/log/syslog")
23 |
24 | p = subprocess.Popen(
25 | f"python3 m4/sourcing/data_collection/callers/dl_images_create_dataset.py {idx} --download_only 1",
26 | shell=True,
27 | preexec_fn=os.setsid,
28 | )
29 | try:
30 | p.wait(2 * 60 * 60)
31 | except subprocess.TimeoutExpired:
32 | os.killpg(os.getpgid(p.pid), signal.SIGTERM)
33 | # p.kill()
34 |
35 | f = open(PATH_LOG, "a")
36 | f.write(f"{idx} done with download only\n")
37 | f.close()
38 |
39 | os.system(f"python3 m4/sourcing/data_collection/callers/dl_images_create_dataset.py {idx} --U 1")
40 |
41 | f = open(PATH_LOG, "a")
42 | f.write(f"{idx} done with create image dataset only\n")
43 | f.close()
44 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts/09_03_split_domain_to_positions.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import random
4 |
5 | from tqdm import tqdm
6 |
7 |
8 | random.seed(42)
9 |
10 | NUM_SHARDS = 200
11 |
12 | PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_S3 = "s3://m4-datasets/webdocs/line_dedup_domain_to_positions.json"
13 | PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_LOCAL = "/scratch/line_dedup_domain_to_positions.json"
14 |
15 | PATH_SAVE_S3_LINE_DEDUP_DOMAIN_TO_POSITIONS_SHARDED = (
16 | "s3://m4-datasets/webdocs/line_dedup_domain_to_positions_sharded/"
17 | )
18 |
19 |
20 | if __name__ == "__main__":
21 | command_sync_s3 = f"aws s3 cp {PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_S3} {PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_LOCAL}"
22 | os.system(command_sync_s3)
23 |
24 | with open(PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_LOCAL) as f:
25 | domain_to_positions = json.load(f)
26 |
27 | keys = list(domain_to_positions.keys())
28 | random.shuffle(keys)
29 |
30 | sublist_size = len(keys) // NUM_SHARDS + 1
31 | keys_per_shard = [set(keys[i : i + sublist_size]) for i in range(0, len(keys), sublist_size)]
32 |
33 | domain_to_positions_shard = []
34 |
35 | for idx_shard in tqdm(range(NUM_SHARDS)):
36 | domain_to_positions_shard.append(
37 | {k: v for k, v in domain_to_positions.items() if k in keys_per_shard[idx_shard]}
38 | )
39 |
40 | with open(f"/scratch/line_dedup_domain_to_positions_{idx_shard}.json", "w") as f:
41 | json.dump(domain_to_positions_shard[idx_shard], f)
42 |
43 | for idx_shard in tqdm(range(NUM_SHARDS)):
44 | path_disk = f"/scratch/line_dedup_domain_to_positions_{idx_shard}.json"
45 | path_s3 = os.path.join(
46 | PATH_SAVE_S3_LINE_DEDUP_DOMAIN_TO_POSITIONS_SHARDED, str(idx_shard), "line_dedup_domain_to_positions.json"
47 | )
48 | command_sync_s3 = f"aws s3 cp {path_disk} {path_s3}"
49 | os.system(command_sync_s3)
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/clean_m4_prelimenary_experiments/README.md:
--------------------------------------------------------------------------------
1 | This folder traces the exploration of additional cleaning that could be applied to the CM4 dataset.
2 |
3 | As a result of this exploration phase, 2 potential improvements have been identified:
4 | 1. Removing HTML nodes (and their descendants) whose tag class attribute value contains either "footer" or "site-info". From the exploration, these would correspond to "web" parts of the web page.
5 | 2. Splitting the HTML at the level of the "continue reading" occurrence, which is often characterized by the class attribute value of the tag containing "more-link".
6 |
7 | **Before fully implementing them**, we tested the suitability of 2. by creating a filtered version of CM4 that excluded all documents that would have contained a "continue reading" occurrence (`04_get_banned_url.slurm` and `05_filter_cm4.slurm`).
8 |
9 | The `explore` folder contains Streamlit spaces that were used to find new possible cleaning rules.
10 |
--------------------------------------------------------------------------------
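The two candidate rules from the README above lend themselves to a small HTML-manipulation sketch. The code below is illustrative only, not repository code: it assumes BeautifulSoup for the HTML handling (the actual CM4 pipeline may rely on different tooling), and only the class substrings ("footer", "site-info", "more-link") come from the README.

```python
# Illustrative sketch of the two candidate cleaning rules; not repository code.
from bs4 import BeautifulSoup

FOOTER_CLASS_SUBSTRINGS = ("footer", "site-info")  # rule 1
MORE_LINK_CLASS_SUBSTRING = "more-link"  # rule 2


def clean_html(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")

    # Rule 1: drop nodes (and their descendants) whose class attribute contains "footer" or "site-info"
    for node in soup.find_all(True, class_=True):
        classes = " ".join(node.get("class", []))
        if any(substring in classes for substring in FOOTER_CLASS_SUBSTRINGS):
            node.extract()

    # Rule 2: truncate the document at the first node whose class attribute contains "more-link"
    def has_more_link_class(tag):
        return MORE_LINK_CLASS_SUBSTRING in " ".join(tag.get("class") or [])

    more_link = soup.find(has_more_link_class)
    if more_link is not None:
        node = more_link
        while node is not None:
            # remove everything that comes after the current node at this level, then walk up
            for sibling in list(node.next_siblings):
                sibling.extract()
            node = node.parent
        more_link.extract()

    return str(soup)
```

Rule 2 keeps only the content that precedes the first "continue reading" link, which matches the splitting behaviour described in the README.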
/vision/data/datasets_processing_scripts/create_evaluation_datasets/MMBench/make_mmbench.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from copy import deepcopy
3 | from io import BytesIO
4 |
5 | import datasets
6 | import pandas as pd
7 | from datasets import Dataset
8 | from PIL import Image
9 |
10 |
11 | PATH_MMBENCH_DATA = ( # DL from https://opencompass.org.cn/mmbench
12 | "/Users/hugolaurencon/Desktop/mmbench_dev_20230712.tsv"
13 | )
14 | NUM_PROC = 10
15 | REPO_ID = "HuggingFaceM4/MMBench_dev"
16 |
17 |
18 | data_frame = pd.read_csv(PATH_MMBENCH_DATA, sep="\t", header=0)
19 |
20 |
21 | ds = Dataset.from_pandas(data_frame)
22 | ds = ds.remove_columns(["index", "category", "source", "l2-category", "comment", "split"])
23 | ds = ds.rename_column("answer", "label")
24 |
25 |
26 | def map_func_transform_image_column(example):
27 | example["image"] = Image.open(BytesIO(base64.b64decode(example["image"])))
28 | return example
29 |
30 |
31 | new_features = deepcopy(ds.features)
32 | new_features["image"] = datasets.Image()
33 | new_features["label"] = datasets.features.ClassLabel(names=["A", "B", "C", "D"])
34 |
35 | ds = ds.map(map_func_transform_image_column, features=new_features, num_proc=NUM_PROC)
36 |
37 | ds.push_to_hub(REPO_ID)
38 |
39 |
40 | def map_func_modif_context(example):
41 | question = example["question"]
42 | hint = example["hint"]
43 | context = []
44 | if hint:
45 | context.append(f"Context: {hint}")
46 | context.append(f"Question: {question}")
47 | context.append("Possible answers:")
48 | for key in ["A", "B", "C", "D"]:
49 | ans = example[key]
50 | if ans:
51 | context.append(f"{key}: {ans}")
52 | context.append("Correct answer: ")
53 | example["context"] = "\n".join(context)
54 | return example
55 |
56 |
57 | ds = ds.map(map_func_modif_context, num_proc=NUM_PROC)
58 | ds = ds.remove_columns(["question", "hint", "A", "B", "C", "D"])
59 | ds.push_to_hub(REPO_ID + "_modif", private=True)
60 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/create_evaluation_datasets/create_AI2D/create_ai2d.py:
--------------------------------------------------------------------------------
1 | """
2 | srun --pty --cpus-per-task=8 --partition=hopper-cpu --qos high bash -i
3 | conda activate shared-m4
4 | """
5 |
6 |
7 | import datasets
8 | from datasets import DatasetDict, load_dataset
9 |
10 |
11 | ORIGINAL_NAME_DS = "lmms-lab/ai2d"
12 | ORIGINAL_SPLIT_DS = "test"
13 |
14 | NUM_PROC = 32
15 |
16 | POSSIBLE_LABELS = ["1", "2", "3", "4"]
17 |
18 | FEATURES = datasets.Features(
19 | {
20 | "question": datasets.Value("string"),
21 | "label": datasets.features.ClassLabel(names=POSSIBLE_LABELS),
22 | "image": datasets.Image(decode=True),
23 | }
24 | )
25 |
26 | NAME_DS_PUSH_HUB = "HuggingFaceM4/AI2D"
27 |
28 |
29 | def map_func_transform_ai2d_ds(example):
30 | example["label"] = str(int(example["answer"]) + 1)
31 | question = example["question"].strip()
32 | question = f"Question: {question}\nChoices:\n"
33 | choices = example["options"]
34 | for idx_choice, choice in enumerate(choices):
35 | question += f"Choice {idx_choice + 1}: {choice}\n"
36 | # question += "Answer with the option number." # Commented because should be defined in the evaluation prompt
37 | example["question"] = question.strip()
38 | return example
39 |
40 |
41 | ds_test = load_dataset(ORIGINAL_NAME_DS, split=ORIGINAL_SPLIT_DS)
42 | columns_to_remove = [c_n for c_n in ds_test.column_names if c_n not in list(FEATURES.keys())]
43 | ds_test = ds_test.map(
44 | map_func_transform_ai2d_ds, remove_columns=columns_to_remove, features=FEATURES, num_proc=NUM_PROC
45 | )
46 | print(ds_test[0]["question"])
47 |
48 |
49 | ds_all_splits = DatasetDict({"test": ds_test})
50 | ds_all_splits.push_to_hub(NAME_DS_PUSH_HUB, private=True)
51 |
52 | # Cache dataset
53 | test_loading = load_dataset(NAME_DS_PUSH_HUB)
54 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/create_evaluation_datasets/create_imagenet1k_1ksupportset_subsets.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from datasets import load_dataset
3 | from tqdm import tqdm
4 |
5 |
6 | NUM_SUBSETS = 6
7 |
8 | NAME_DS = "HuggingFaceM4/imagenet1k_support_1k_query_sets"
9 |
10 |
11 | ds_subsets = [load_dataset(NAME_DS, use_auth_token=True) for _ in range(NUM_SUBSETS)]
12 |
13 | num_test_examples = ds_subsets[0]["test_query_set"].num_rows
14 |
15 | selected_indices = np.array_split(range(num_test_examples), NUM_SUBSETS)
16 |
17 | for idx_ds in range(NUM_SUBSETS):
18 | ds_subsets[idx_ds]["test_query_set"] = ds_subsets[idx_ds]["test_query_set"].select(selected_indices[idx_ds])
19 |
20 | for idx_ds in tqdm(range(NUM_SUBSETS)):
21 | ds_subsets[idx_ds].push_to_hub(repo_id=NAME_DS + f"_part_{idx_ds}", private=True)
22 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/create_evaluation_datasets/create_imagenet1k_5ksupportset_subsets.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from datasets import load_dataset
3 | from tqdm import tqdm
4 |
5 |
6 | NUM_SUBSETS = 6
7 |
8 | NAME_DS = "HuggingFaceM4/imagenet1k_support_5k_query_sets"
9 |
10 |
11 | ds_subsets = [load_dataset(NAME_DS, use_auth_token=True) for _ in range(NUM_SUBSETS)]
12 |
13 | num_test_examples = ds_subsets[0]["test_query_set"].num_rows
14 |
15 | selected_indices = np.array_split(range(num_test_examples), NUM_SUBSETS)
16 |
17 | for idx_ds in range(NUM_SUBSETS):
18 | ds_subsets[idx_ds]["test_query_set"] = ds_subsets[idx_ds]["test_query_set"].select(selected_indices[idx_ds])
19 |
20 | for idx_ds in tqdm(range(NUM_SUBSETS)):
21 | ds_subsets[idx_ds].push_to_hub(repo_id=NAME_DS + f"_part_{idx_ds}", private=True)
22 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/create_evaluation_datasets/create_vqav2_subsets.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from datasets import load_dataset
3 | from tqdm import tqdm
4 |
5 |
6 | NUM_SUBSETS = 6
7 |
8 | NAME_DS = "HuggingFaceM4/VQAv2_modif_support_query_sets"
9 |
10 |
11 | ds_subsets = [load_dataset(NAME_DS, use_auth_token=True) for _ in range(NUM_SUBSETS)]
12 |
13 | num_test_examples = ds_subsets[0]["test_query_set"].num_rows
14 |
15 | selected_indices = np.array_split(range(num_test_examples), NUM_SUBSETS)
16 |
17 | for idx_ds in range(NUM_SUBSETS):
18 | ds_subsets[idx_ds]["test_query_set"] = ds_subsets[idx_ds]["test_query_set"].select(selected_indices[idx_ds])
19 |
20 | for idx_ds in tqdm(range(NUM_SUBSETS)):
21 | ds_subsets[idx_ds].push_to_hub(repo_id=NAME_DS + f"_part_{idx_ds}", private=True)
22 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/create_fine_tuning_datasets/create_llavar.py:
--------------------------------------------------------------------------------
1 | # DL finetuning images: https://drive.google.com/file/d/1Ms7OCjcFQ18Whmujszpc9bTp0Jy0Dye4/view?usp=sharing
2 | # DL finetuning instructions: https://drive.google.com/file/d/1ISdKOV1wwVkLHf5FNutctpOBa-CmNRFv/view?usp=sharing
3 |
4 |
5 | import json
6 | import os
7 |
8 | from datasets import Dataset
9 | from PIL import Image
10 |
11 |
12 | PATH_CONV = "/Users/hugolaurencon/Desktop/llava_instruct_150k_llavar_16k.json"
13 | PATH_DIR_IMAGES = "/Users/hugolaurencon/Desktop/finetune"
14 |
15 |
16 | with open(PATH_CONV) as f:
17 | data_conv = json.load(f)
18 | data_conv = data_conv[-15500:]  # The earlier entries are only the regular LLaVA instructions
19 |
20 |
21 | all_image = []
22 | all_user_texts = []
23 | all_bot_texts = []
24 |
25 | for conv in data_conv:
26 | image_path = os.path.join(PATH_DIR_IMAGES, conv["image"])
27 | image = Image.open(image_path)
28 | all_image.append(image)
29 | user_texts = []
30 | bot_texts = []
31 | for turn in conv["conversations"]:
32 | if turn["from"] == "human":
33 |             user_texts.append(turn["value"].replace("<image>", "").strip())  # drop the "<image>" placeholder token
34 | elif turn["from"] == "gpt":
35 | bot_texts.append(turn["value"])
36 | assert len(user_texts) == len(bot_texts)
37 | all_user_texts.append(user_texts)
38 | all_bot_texts.append(bot_texts)
39 |
40 | assert len(all_image) == len(all_user_texts) == len(all_bot_texts)
41 |
42 |
43 | ds = Dataset.from_dict({"image": all_image, "user_texts": all_user_texts, "bot_texts": all_bot_texts})
44 | ds.push_to_hub("HuggingFaceM4/LLaVAR-Instruct-16K")
45 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/create_valid_ds/create_cm4_valid.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from datasets import load_from_disk
4 |
5 |
6 | SUBSET_DIR_PATH = "/scratch/m4/webdocs/web_document_dataset_filtered/"
7 | BIG_SHARD_ID = 0
8 | cm4_valid_path = f"{SUBSET_DIR_PATH}/{BIG_SHARD_ID}"
9 | sync_cmd = (
10 | "s5cmd sync"
11 | f" s3://m4-datasets/webdocs/web_document_dataset_filtered_imgurldedup_nsfwfiltered_urldedup_linededup_finalcleaning_setimgurlsdedup/{BIG_SHARD_ID}/*"
12 | f" {cm4_valid_path}"
13 | )
14 |
15 | os.system(sync_cmd)
16 |
17 | ds = load_from_disk(cm4_valid_path)
18 |
19 | ds_sample = ds.select(range(10000))
20 | repo_id = "HuggingFaceM4/cm4_valid-Sample"
21 | ds_sample.push_to_hub(repo_id, "valid", private=True)
22 |
23 | ds.push_to_hub("HuggingFaceM4/cm4_valid", "valid", private=True)
24 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/create_valid_ds/create_coco_valid.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from datasets import load_from_disk
4 |
5 |
6 | SUBSET_DIR_PATH = "/scratch/general_pmd/image/coco/"
7 | valid_path = f"{SUBSET_DIR_PATH}/validation"
8 | sync_cmd = f"s5cmd sync s3://m4-datasets/general_pmd/image/coco/validation/00000-00001/* {valid_path}"
9 |
10 | os.system(sync_cmd)
11 |
12 | ds = load_from_disk(valid_path)
13 | print(ds)
14 | repo_id = "HuggingFaceM4/coco_valid"
15 | ds.push_to_hub(repo_id, "valid", private=True)
16 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/create_valid_ds/create_wiki_valid.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from datasets import concatenate_datasets, load_from_disk
4 |
5 |
6 | SUBSET_DIR_PATH = "/scratch/enwiki/"
7 | valid_path = f"{SUBSET_DIR_PATH}/validation"
8 | sync_cmd = f"s5cmd sync s3://m4-datasets/enwiki/enwiki-v2/valid/* {valid_path}"
9 |
10 | os.system(sync_cmd)
11 |
12 |
13 | shard_valid_path = f"{SUBSET_DIR_PATH}/validation/shard_0"
14 | ds = load_from_disk(shard_valid_path)
15 | print(ds)
16 | repo_id = "HuggingFaceM4/enwiki-v2_valid-Sample"
17 | ds.push_to_hub(repo_id, "valid", private=True)
18 |
19 |
20 | valid_path = [f"{SUBSET_DIR_PATH}/validation/shard_{shard_id}" for shard_id in range(10)]
21 | ds = [load_from_disk(path) for path in valid_path]
22 | ds = concatenate_datasets(ds)
23 |
24 | print(ds)
25 | repo_id = "HuggingFaceM4/enwiki-v2_valid"
26 | ds.push_to_hub(repo_id, "valid", private=True)
27 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/enwiki/REAME.md:
--------------------------------------------------------------------------------
1 | This folder contains all the slurm, bash and python scripts used to build enwiki-v1 and enwiki-v2. The numbering of the files indicates the order in which they were run.
2 |
3 | Beware: these scripts were sometimes run on different machines to process a portion of the shards; the changes needed to parallelize the work are not contained in the scripts in the `slurm_and_bash_scripts` folder.
4 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/enwiki/python_scripts/05_download_remaining_urls.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from m4.sourcing.data_collection.processors.web_document_extractor import download_images
4 |
5 |
6 | SHARD_ID = 9
7 | NUM_SHARDS = 33
8 | DATA_DIR = Path("/home/lucile/local_datasets/enwiki/enwiki-NS0-20230220-ENTERPRISE-HTML-EXTRACTION")
9 | DATASET_NAME_INCOMPLETE_EXAMPLES = "wikipedia_html_enterprise-with-images-incomplete-v1-v2"
10 | NUM_PROC = 32 // 2
11 | REMAINING_URLS_FILENAME = f"remaining_urls_v2_shard_{SHARD_ID}.txt"
12 | DOWNLOADED_IMAGES_DIRNAME = f"downloaded_images-v3_shard_{SHARD_ID}"
13 |
14 |
15 | path_save_file_image_urls = DATA_DIR / REMAINING_URLS_FILENAME
16 | path_save_dir_downloaded_images = DATA_DIR / DOWNLOADED_IMAGES_DIRNAME
17 | number_sample_per_shard = 10_000
18 | image_size = 256
19 | resize_mode = "no"
20 | num_proc = 1
21 | thread_count = 1
22 |
23 | download_images(
24 | path_save_file_image_urls,
25 | path_save_dir_downloaded_images,
26 | number_sample_per_shard,
27 | image_size,
28 | resize_mode,
29 | num_proc,
30 | thread_count,
31 | )
32 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/enwiki/python_scripts/06_create_image_dataset.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | from pathlib import Path
4 |
5 | from m4.sourcing.data_collection.processors.web_document_extractor import create_dataset_images_from_tar
6 |
7 |
8 | path_save_dir_downloaded_images = Path("/home/lucile/local_datasets/enwiki/enwiki-v2-downloaded-images")
9 | path_save_dir_tmp_datasets_images = Path("/home/lucile/local_datasets/enwiki/enwiki-v2-ds-images-tmp")
10 | num_proc = 16
11 | path_save_file_map_url_idx = Path("/home/lucile/local_datasets/enwiki/enwiki-v2-map-url-idx.json")
12 | path_save_dir_dataset_images = Path("/home/lucile/local_datasets/enwiki/enwiki-v2-ds-images")
13 |
14 | tar_paths = []
15 | for path_save_dir_downloaded_images_shard in path_save_dir_downloaded_images.glob("*"):
16 | if path_save_dir_downloaded_images_shard.is_dir():
17 | tar_paths.extend(glob.glob(os.path.join(path_save_dir_downloaded_images_shard, "*.tar")))
18 |
19 | create_dataset_images_from_tar(
20 | tar_paths,
21 | path_save_dir_tmp_datasets_images,
22 | num_proc,
23 | path_save_file_map_url_idx,
24 | path_save_dir_dataset_images,
25 | )
26 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/enwiki/python_scripts/08_save_dataset.py:
--------------------------------------------------------------------------------
1 | # %%
2 | from pathlib import Path
3 |
4 | from datasets import concatenate_datasets, load_from_disk
5 |
6 | from m4.sourcing.data_collection.processors.web_document_extractor import save_split_sharded_already_splitted_dataset
7 |
8 |
9 | NUM_SHARDS = 68
10 | DS_V1_PATH = Path("/home/lucile/local_datasets/enwiki/enwiki-v1")
11 | DS_V2_COMMON_PATH = Path("/home/lucile/local_datasets/enwiki/enwiki-NS0-20230220-ENTERPRISE-HTML-EXTRACTION")
12 | EXCLUDE_SHARD_IDS = [34]
13 | DATASET_NAME_COMPLETE_EXAMPLES_V2 = "wikipedia_html_enterprise-with-images-full-v2-v3"
14 | SHARD_SIZE = 20_000
15 |
16 | DS_FINAL_DS_PATH = Path("/home/lucile/local_datasets/enwiki/enwiki-v2")
17 | ds_v1 = load_from_disk(DS_V1_PATH)
18 | # %%
19 | ds_v1
20 | # %%
21 | ds_v1_merged = concatenate_datasets([ds_v1["train"], ds_v1["valid"]])
22 | # %%
23 | ds_v1_merged
24 | # %%
25 |
26 | ds_list = []
27 | for shard_id in range(0, NUM_SHARDS):
28 | if shard_id in EXCLUDE_SHARD_IDS:
29 | continue
30 | print(f"Processing shard {shard_id}...")
31 | shard_dir = DS_V2_COMMON_PATH / f"shard_{shard_id}"
32 | ds_path = shard_dir / DATASET_NAME_COMPLETE_EXAMPLES_V2
33 | ds = load_from_disk(ds_path)
34 | ds_list.append(ds)
35 |
36 | ds_v2 = concatenate_datasets(ds_list)
37 | # %%
38 | ds_full = concatenate_datasets([ds_v1_merged, ds_v2])
39 | # %%
40 | ds_full = ds_full.remove_columns(["images_urls", "num_found", "num_not_found", "mismatches"])
41 | # %%
42 | ds_full = ds_full.train_test_split(test_size=0.05, shuffle=False)
43 | ds_full["valid"] = ds_full["test"]
44 | ds_full.pop("test")
45 |
46 | save_split_sharded_already_splitted_dataset(
47 | ds_full, Path("/home/lucile/local_datasets/enwiki") / "enwiki-v2-full", SHARD_SIZE
48 | )
49 | # %%
50 |
--------------------------------------------------------------------------------
/vision/data/datasets_processing_scripts/integrate_evaluation_benchmarks_chatbot/gqa.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 |
4 | import datasets
5 | from datasets import DatasetDict, load_dataset
6 |
7 | from datasets_processing_scripts.build_concatenation_datasets_sft.build_ds_sft import (
8 | PROMPTS_ANSWER_SHORTLY,
9 | convert_img_to_bytes,
10 | )
11 |
12 |
13 | NUM_PROC = 96
14 |
15 | FEATURES = datasets.Features(
16 | {
17 | "image": datasets.Image(decode=True),
18 | "question": datasets.Value("string"),
19 | "answers": datasets.Sequence(datasets.Value("string")),
20 | "question_id": datasets.Value("int64"),
21 | }
22 | )
23 |
24 |
25 | def map_transform_gqa(example):
26 | path_image = os.path.join("/fsx/hugo/gqa/images", os.path.basename(example["image_id"]))
27 | image_bytes = convert_img_to_bytes(img_path=path_image, format="JPEG")
28 | question = example["question"] + random.choice(PROMPTS_ANSWER_SHORTLY)
29 | example["image"] = {"path": None, "bytes": image_bytes}
30 | example["question"] = question
31 | example["answers"] = [example["label"]]
32 | return example
33 |
34 |
35 | def load_gqa(split):
36 | ds_gqa = load_dataset("Graphcore/gqa", split=split)
37 | columns_to_keep = ["image", "question", "answers", "question_id"]
38 | columns_to_remove = [c_n for c_n in ds_gqa.column_names if c_n not in columns_to_keep]
39 | ds_gqa = ds_gqa.map(map_transform_gqa, remove_columns=columns_to_remove, features=FEATURES, num_proc=NUM_PROC)
40 | return ds_gqa
41 |
42 |
43 | ds_gqa_all_splits = DatasetDict(
44 | {"train": load_gqa("train"), "validation": load_gqa("validation"), "test": load_gqa("test")}
45 | )
46 |
47 | ds_gqa_all_splits.push_to_hub("HuggingFaceM4/GQA", private=True)
48 |
--------------------------------------------------------------------------------
/vision/evaluation/README.md:
--------------------------------------------------------------------------------
1 | # Evaluation
2 |
3 | We implemented the evaluations for SmolVLM in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit).
4 | They can be run by following the instructions in their repository.
5 |
6 | We also have our own internal evaluation scripts; they can be found in the `experiments/evaluation` folder. The code supporting them is in the `m4` folder.
--------------------------------------------------------------------------------
/vision/experiments/evaluation/vloom/common/accelerate_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config: {}
3 | distributed_type: MULTI_GPU
4 | fsdp_config: {}
5 | machine_rank: 0
6 | main_process_ip: null
7 | main_process_port: null
8 | main_training_function: main
9 | mixed_precision: 'no'
10 | num_machines: null
11 | num_processes: null
12 | use_cpu: false
13 |
--------------------------------------------------------------------------------
/vision/experiments/evaluation/vloom/common/sync_evaluations_on_gcs.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=gcs_sync_eval
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --qos=qos_cpu-dev
6 | #SBATCH --partition=compil
7 | #SBATCH --cpus-per-task=1           # number of cores per task
8 | #SBATCH --hint=nomultithread # we get physical cores not logical
9 | #SBATCH --time 00:05:00 # maximum execution time (HH:MM:SS)
10 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/local_experiment_dir/evals/run_eval_master/logs/%x_%j.out
11 | #SBATCH --account=cnw@cpu
12 | #SBATCH --mail-type=FAIL,INVALID_DEPEND,REQUEUE,STAGE_OUT,TIME_LIMIT
13 | #SBATCH --mail-user=hf-m4-jz@googlegroups.com
14 | #SBATCH --no-requeue
15 |
16 | set -x -e
17 |
18 | source $cnw_ALL_CCFRWORK/start-m4-user
19 |
20 | gsutil cp $EVALUATION_JSONL_FILE gs://hf-science-m4-cold/local_experiment_dir/evals/results/
21 |
--------------------------------------------------------------------------------
/vision/experiments/evaluation/vloom/common/sync_evaluations_on_s3.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=s3_sync_eval
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=1            # number of cores per task
6 | #SBATCH --time 00:05:00 # maximum execution time (HH:MM:SS)
7 | #SBATCH --mem-per-cpu=11G
8 | #SBATCH --output=/fsx/m4/evals/run_eval_master/logs/%x_%j.out
9 | set -x -e
10 |
11 | source $cnw_ALL_CCFRWORK/start-m4-user
12 | BASENAME_EVALUATION_JSONL_FILE="$(basename $EVALUATION_JSONL_FILE)"
13 | aws s3 cp $EVALUATION_JSONL_FILE s3://m4-exps/eval_results/$BASENAME_EVALUATION_JSONL_FILE
14 |
--------------------------------------------------------------------------------
/vision/experiments/evaluation/vloom/common/sync_evaluations_on_wandb.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=run_eval_automatic
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --qos=qos_cpu-dev
6 | #SBATCH --partition=compil
7 | #SBATCH --cpus-per-task=4           # number of cores per task
8 | #SBATCH --hint=nomultithread # we get physical cores not logical
9 | #SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS)
10 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/local_experiment_dir/evals/run_eval_master/logs/%x_%j.out
11 | #SBATCH --account=cnw@cpu
12 | #SBATCH --mail-type=FAIL,INVALID_DEPEND,REQUEUE,STAGE_OUT,TIME_LIMIT
13 | #SBATCH --mail-user=hf-m4-jz@googlegroups.com
14 | #SBATCH --no-requeue
15 |
16 | set -x -e
17 |
18 | source $cnw_ALL_CCFRWORK/start-m4-user
19 |
20 | conda activate $CONDA_ENV_NAME
21 |
22 | pushd $WORKING_DIR
23 |
24 | python m4/evaluation/scripts/sync_evaluations_on_wandb.py \
25 | --evaluation_jsonl_files $EVALUATION_JSONL_FILE \
26 | --run_name_to_log $RUN_NAME
27 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/common/sync_and_upload.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # $1: save_dir
4 | # $2: exp_name
5 | # $3: previously saved step
6 | # $4: current saved step
7 | # Example use: ./sync_and_upload.sh /home/victor/experiments vllama_debug opt_step-40 opt_step-50
8 |
9 | if [[ -n "$4" ]]; then
10 | if [[ -n "$3" ]]; then
11 | s5cmd sync "$1/$2/$3/" "s3://m4-exps/$2/$3/" && rm -rf "$1/$2/$3"
12 | fi
13 | s5cmd cp "$1/$2/$4/" "s3://m4-exps/$2/$4/"
14 | fi
15 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/common/webdataset_get_file.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # create a unique temp file per tar
4 | tmpfile=$(mktemp /scratch/m4data/tmp-dldata.XXXXXXXX)
5 |
6 | # auto-remove the temp file when the script exits for any reason or receives SIGINT/SIGTERM
7 | trap "rm -f $tmpfile" EXIT INT TERM
8 |
9 | # make sure that the only output from the script is generated by the `cat` command below
10 | s5cmd cp $1 $tmpfile > /dev/null 2>&1
11 | cat $tmpfile
12 |
13 | # note: tmpfile gets autodeleted on exit via trap above
14 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/accelerate_config_multi_node.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config:
3 | deepspeed_multinode_launcher: standard
4 | deepspeed_config_file: ./experiments/pretraining/vloom/slurm_scripts_templates/ds_config.json
5 | zero3_init_flag: true
6 | distributed_type: DEEPSPEED
7 | fsdp_config: {}
8 | machine_rank: 0
9 | main_process_ip: null
10 | main_process_port: null
11 | main_training_function: main
12 | mixed_precision: fp16
13 | num_machines: 2
14 | num_processes: 8
15 | use_cpu: false
16 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/accelerate_config_single_node.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config:
3 | deepspeed_multinode_launcher: standard
4 | deepspeed_config_file: ./experiments/pretraining/vloom/slurm_scripts_templates/ds_config.json
5 | zero3_init_flag: true
6 | distributed_type: DEEPSPEED
7 | fsdp_config: {}
8 | machine_rank: 0
9 | main_process_ip: null
10 | main_process_port: null
11 | main_training_function: main
12 | mixed_precision: fp16
13 | num_machines: 1
14 | num_processes: 4
15 | use_cpu: false
16 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": true,
4 | "auto_cast": true,
5 | "loss_scale": 0,
6 | "initial_scale_power": 32,
7 | "loss_scale_window": 1000,
8 | "hysteresis": 2,
9 | "min_loss_scale": 1
10 | },
11 | "zero_optimization": {
12 | "stage": 2,
13 | "allgather_partitions": true,
14 | "allgather_bucket_size": 5e8,
15 | "overlap_comm": false,
16 | "reduce_scatter": true,
17 | "reduce_bucket_size": "auto",
18 | "contiguous_gradients": true,
19 | "offload_optimizer": {
20 | "device": "cpu"
21 | },
22 | "offload_param": {
23 | "device": "cpu"
24 | },
25 | "stage3_gather_16bit_weights_on_model_save": "auto"
26 | },
27 | "train_micro_batch_size_per_gpu": "auto",
28 | "train_batch_size": "auto",
29 | "gradient_clipping": "auto"
30 | }
31 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/ds_config_bf16.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": true
4 | },
5 | "zero_optimization": {
6 | "stage": 2,
7 | "allgather_partitions": true,
8 | "allgather_bucket_size": 5e8,
9 | "overlap_comm": false,
10 | "reduce_scatter": true,
11 | "reduce_bucket_size": "auto",
12 | "contiguous_gradients": true
13 | },
14 | "gradient_clipping": "auto",
15 | "train_batch_size": "auto",
16 | "train_micro_batch_size_per_gpu": "auto"
17 | }
18 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/hfc_with_launcher/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=hopper-prod
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/$RUN_NAME/logs/%x-%j.out
8 |
9 | set -e
10 |
11 | ### EDIT ME START ###
12 |
13 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
14 | RUN_FREQUENCY_IN_HOURS=3
15 |
16 | CONDA_ENV_NAME=shared-m4
17 |
18 | M4_REPO_PATH=/fsx/m4/repos/m4
19 | EXPERIMENT_NAME=tr_184_xxx
20 |
21 | ### EDIT ME END ###
22 |
23 |
24 | echo "START TIME: $(date)"
25 |
26 | source /fsx/m4/start-m4-user
27 | conda activate base
28 | conda activate $CONDA_ENV_NAME
29 |
30 | # ensure to restart self first
31 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
32 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
33 |
34 | echo "running checkpoint cleanup"
35 |
36 |
37 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
38 |
39 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH
40 |
41 | echo "END TIME: $(date)"
42 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/hfc_with_launcher/convert-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=convert-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --gres=gpu:8
7 | #SBATCH --partition=hopper-prod
8 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/$RUN_NAME/logs/%x-%j.out
9 |
10 |
11 | set -e
12 |
13 | ### EDIT ME START ###
14 |
15 | # how often to try to run the checkpoint conversion - hint: approximately as often as a checkpoint is saved
16 | RUN_FREQUENCY_IN_HOURS=3
17 |
18 | CONDA_ENV_NAME=shared-m4
19 |
20 | M4_REPO_PATH=/fsx/m4/repos/m4
21 | EXPERIMENT_NAME=tr_184_xxx
22 |
23 | ### EDIT ME END ###
24 |
25 |
26 | echo "START TIME: $(date)"
27 |
28 | source /fsx/m4/start-m4-user
29 | conda activate base
30 | conda activate $CONDA_ENV_NAME
31 |
32 | # ensure to restart self first
33 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
34 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour convert-checkpoints.slurm
35 |
36 | echo "running checkpoint converter"
37 |
38 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
39 |
40 | python -u $M4_REPO_PATH/m4/scripts/convert-checkpoints.py $M4_CHECKPOINTS_PATH
41 |
42 | echo "END TIME: $(date)"
43 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/hfc_with_launcher/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=hopper-prod
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/$RUN_NAME/logs/%x-%j.out
8 |
9 |
10 | set -e
11 |
12 | ### EDIT ME START ###
13 |
14 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
15 | RUN_FREQUENCY_IN_HOURS=3
16 |
17 | CONDA_ENV_NAME=shared-m4
18 |
19 | M4_REPO_PATH=/fsx/m4/repos/m4
20 | EXPERIMENT_NAME=tr_184_xxx
21 |
22 | ### EDIT ME END ###
23 |
24 |
25 | echo "START TIME: $(date)"
26 |
27 | source /fsx/m4/start-m4-user
28 | conda activate base
29 | conda activate $CONDA_ENV_NAME
30 |
31 | # ensure to restart self first
32 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
33 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
34 |
35 | echo "running checkpoint converter"
36 |
37 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
38 |
39 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
40 |
41 | echo "END TIME: $(date)"
42 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/hfc_with_launcher/schedule-evals.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=schedule-evals
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=00:30:00
6 | #SBATCH --partition=hopper-prod
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/$RUN_NAME/logs/%x-%j.out
8 |
9 | set -e
10 |
11 | ### EDIT ME START ###
12 |
13 | # how often to try to run the eval scheduling - hint: approximately as often as a checkpoint is saved
14 | RUN_FREQUENCY_IN_HOURS=3
15 |
16 | CONDA_ENV_NAME=shared-m4
17 |
18 | M4_REPO_PATH=/fsx/m4/repos/m4
19 | EXPERIMENT_NAME=tr_184_xxx
20 |
21 | ### EDIT ME END ###
22 |
23 |
24 | echo "START TIME: $(date)"
25 |
26 | source /fsx/m4/start-m4-user
27 | conda activate base
28 | conda activate $CONDA_ENV_NAME
29 |
30 | # ensure to restart self first
31 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
32 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour schedule-evals.slurm
33 |
34 | echo "running eval scheduler"
35 |
36 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
37 |
38 | python -u $M4_REPO_PATH/m4/scripts/schedule-evals.py $M4_CHECKPOINTS_PATH
39 |
40 | echo "END TIME: $(date)"
41 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/with_launcher/accelerate_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config:
3 | deepspeed_multinode_launcher: standard
4 | deepspeed_config_file: ./experiments/pretraining/vloom/xxxx/ds_config.json
5 | gradient_accumulation_steps: 1
6 | gradient_clipping: 1.0
7 | offload_optimizer_device: cpu
8 | offload_param_device: cpu
9 | zero3_init_flag: true
10 | zero_stage: 2
11 | distributed_type: DEEPSPEED
12 | fsdp_config: {}
13 | machine_rank: 0
14 | main_process_ip: null
15 | main_process_port: null
16 | main_training_function: main
17 | mixed_precision: bf16
18 | num_machines: null
19 | num_processes: null
20 | use_cpu: false
21 |
--------------------------------------------------------------------------------
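Note: a config like accelerate_config.yaml above is normally consumed by the accelerate CLI, which in turn loads the DeepSpeed JSON named in deepspeed_config_file (the ds_config.json shown a bit further below). A minimal, hypothetical invocation as a sketch; train.py and its --config flag are placeholders rather than files or options from this repo, and since num_machines/num_processes are null in the YAML they would typically be supplied on the command line or by the launcher:

    accelerate launch \
        --config_file ./experiments/pretraining/vloom/slurm_scripts_templates/with_launcher/accelerate_config.yaml \
        --num_machines 1 \
        --num_processes 8 \
        train.py --config ./experiments/pretraining/vloom/slurm_scripts_templates/with_launcher/config.yaml
--------------------------------------------------------------------------------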
/vision/experiments/pretraining/vloom/slurm_scripts_templates/with_launcher/config.yaml:
--------------------------------------------------------------------------------
1 | data_param:
2 | map_batch_size: 32
3 | num_workers: 2
4 | p_next: 0.
5 | max_seq_len: 77
6 | pad_dataset: True
7 | realtime_processing: True
8 | persistent_workers: False
9 | hparams:
10 | tokenizer_name: gpt2
11 | tokenizer_params: '{"additional_special_tokens":[AddedToken("", rstrip=False, lstrip=False)], "use_fast":True}'
12 | tokenizer_add_special_tokens: '{"pad_token": tokenizer.eos_token}'
13 | model_name: gpt2-xl
14 | model_params:
15 | vision_image_size: 224
16 | vision_model_name: openai/clip-vit-base-patch16
17 | vision_model_params: '{"id2label":{}, "label2id":{}}'
18 | tie_word_embeddings: True
19 | freeze_lm_head: True
20 | freeze_text_layers: True
21 | freeze_vision_layers: True
22 | alpha_initializer: zeros
23 | alpha_type: float
24 | cross_layer_interval: 1
25 | batch_size: 8
26 | grad_acc_size: 1
27 | grad_clip: 1.0
28 | max_num_opt_steps: 500_000
29 | seed: 13
30 | train_logging_opt_steps: 10
31 | train_saving_opt_steps: 250
32 | val_logging_opt_steps: 250
33 | wandb_enable: true
34 | wandb_entity: huggingfacem4
35 | wandb_log_freq: 10
36 | wandb_project: VLOOM
37 | optim_param:
38 | vl_optim: AdamW
39 | vl_optim_params:
40 | betas: [0.9, 0.999]
41 | lr: 0.0001
42 | weight_decay: 0.1
43 | no_decay: ["bias", "alpha", "layernorm", "ln"]
44 | vl_lr_scheduler: get_constant_schedule_with_warmup
45 | vl_lr_scheduler_params:
46 | last_epoch: -1
47 | num_warmup_steps: 5_000
48 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/slurm_scripts_templates/with_launcher/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": true
4 | },
5 | "zero_optimization": {
6 | "stage": 2,
7 | "allgather_partitions": true,
8 | "allgather_bucket_size": 5e8,
9 | "overlap_comm": false,
10 | "reduce_scatter": true,
11 | "reduce_bucket_size": "auto",
12 | "contiguous_gradients": true
13 | },
14 | "gradient_clipping": "auto",
15 | "train_batch_size": "auto",
16 | "train_micro_batch_size_per_gpu": "auto"
17 | }
18 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_341_smolvlm_025b_1st_stage/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_341_vsmollm2-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=6
28 |
29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
30 |
31 | EXPERIMENT_NAME=tr_341_vsmollm2_05b
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # ensure to restart self first
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
49 |
50 | echo "running checkpoint cleanup"
51 |
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_341_smolvlm_025b_1st_stage/merge_lora_and_resize_eou.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-25750"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_25750_merge_and_resize_eou"
29 |
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 |
34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR
36 | echo "Done"
37 |
38 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_341_smolvlm_025b_1st_stage/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_341-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 |
11 | set -e
12 |
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 | # check the original location through scontrol and $SLURM_JOB_ID
16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 | # otherwise: started with bash. Get the real location.
19 | SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=8
29 |
30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
31 |
32 | EXPERIMENT_NAME=tr_341_vsmollm2_05b
33 |
34 | ### EDIT ME END ###
35 |
36 |
37 | echo "START TIME: $(date)"
38 |
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 |
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 |
47 | # ensure to restart self first
48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
50 |
51 | echo "running s3 checkpoint upload"
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_341_smolvlm_025b_1st_stage/schedule-evals.sh:
--------------------------------------------------------------------------------
1 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_1024.slurm
2 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_2048.slurm
3 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_1024.slurm
4 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_2048.slurm
5 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_343_smolvlm_05b_1st_stage/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_343-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_343_vsmollm2_05b/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=6
28 |
29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
30 |
31 | EXPERIMENT_NAME=tr_343_vsmollm2_05b
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # ensure to restart self first
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
49 |
50 | echo "running checkpoint cleanup"
51 |
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_343_smolvlm_05b_1st_stage/merge_lora_and_resize_eou.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_343_vsmollm2_05b/opt_step-24750"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_343_vsmollm2_05b_opt_step_24750_merge_and_resize_eou"
29 |
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 |
34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
35 | python experiments/pretraining/vloom/tr_343_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR
36 | echo "Done"
37 |
38 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_343_smolvlm_05b_1st_stage/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_343-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_343_vsmollm2_05b/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 |
11 | set -e
12 |
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 | # check the original location through scontrol and $SLURM_JOB_ID
16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 | # otherwise: started with bash. Get the real location.
19 | SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=8
29 |
30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
31 |
32 | EXPERIMENT_NAME=tr_343_vsmollm2_05b
33 |
34 | ### EDIT ME END ###
35 |
36 |
37 | echo "START TIME: $(date)"
38 |
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 |
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 |
47 | # ensure to restart self first
48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
50 |
51 | echo "running s3 checkpoint upload"
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_343_smolvlm_05b_1st_stage/schedule-evals.slurm:
--------------------------------------------------------------------------------
1 | sbatch experiments/evaluation/vloom/async_evals_tr_343/run_evals_4_shots_captioning_1024.slurm
2 | sbatch experiments/evaluation/vloom/async_evals_tr_343/run_evals_4_shots_captioning_2048.slurm
3 | sbatch experiments/evaluation/vloom/async_evals_tr_343/run_evals_4_shots_vqa_1024.slurm
4 | sbatch experiments/evaluation/vloom/async_evals_tr_343/run_evals_4_shots_vqa_2048.slurm
5 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_345_vsmollm2_256M_2nd_stage/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_320-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_320_vsmollm2_long_context/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=6
28 |
29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
30 |
31 | EXPERIMENT_NAME=tr_320_vsmollm2_long_context
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # ensure to restart self first
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
49 |
50 | echo "running checkpoint cleanup"
51 |
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_345_vsmollm2_256M_2nd_stage/merge_lora_and_resize_eou.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou"
29 |
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 |
34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR
36 | echo "Done"
37 |
38 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_345_vsmollm2_256M_2nd_stage/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_345-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_345_vsmollm2_256M_2nd_stage/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 |
11 | set -e
12 |
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 | # check the original location through scontrol and $SLURM_JOB_ID
16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 | # otherwise: started with bash. Get the real location.
19 | SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=8
29 |
30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
31 |
32 | EXPERIMENT_NAME=tr_345_vsmollm2_256M_2nd_stage
33 |
34 | ### EDIT ME END ###
35 |
36 |
37 | echo "START TIME: $(date)"
38 |
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 |
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 |
47 | # ensure to restart self first
48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
50 |
51 | echo "running s3 checkpoint upload"
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_345_vsmollm2_256M_2nd_stage/schedule-evals.sh:
--------------------------------------------------------------------------------
1 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_1024.slurm
2 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_2048.slurm
3 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_1024.slurm
4 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_2048.slurm
5 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_346_vsmollm2_256M_3rd_stage/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_346-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_346_vsmollm2_256M_3rd_stage/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=6
28 |
29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
30 |
31 | EXPERIMENT_NAME=tr_346_vsmollm2_256M_3rd_stage
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # ensure to restart self first
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
49 |
50 | echo "running checkpoint cleanup"
51 |
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_346_vsmollm2_256M_3rd_stage/merge_lora_and_resize_eou.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou"
29 |
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 |
34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR
36 | echo "Done"
37 |
38 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_346_vsmollm2_256M_3rd_stage/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_346-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=hopper-prod
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_346_vsmollm2_256M_3rd_stage/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 |
11 | set -e
12 |
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 | # check the original location through scontrol and $SLURM_JOB_ID
16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 | # otherwise: started with bash. Get the real location.
19 | SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=8
29 |
30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
31 |
32 | EXPERIMENT_NAME=tr_346_vsmollm2_256M_3rd_stage
33 |
34 | ### EDIT ME END ###
35 |
36 |
37 | echo "START TIME: $(date)"
38 |
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 |
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 |
47 | # ensure to restart self first
48 | # echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | # sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
50 |
51 | echo "running s3 checkpoint upload"
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_346_vsmollm2_256M_3rd_stage/schedule-evals.sh:
--------------------------------------------------------------------------------
1 | sbatch experiments/evaluation/vloom/async_evals_tr_346/run_evals_0_shots_val_512.slurm
2 | sbatch experiments/evaluation/vloom/async_evals_tr_346/run_evals_0_shots_val_1024.slurm
3 | sbatch experiments/evaluation/vloom/async_evals_tr_346/run_evals_0_shots_val_1536.slurm
4 | sbatch experiments/evaluation/vloom/async_evals_tr_346/run_evals_0_shots_val_2048.slurm
5 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_347_smolvlm_500M_2nd_stage/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_320-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_320_vsmollm2_long_context/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=6
28 |
29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
30 |
31 | EXPERIMENT_NAME=tr_320_vsmollm2_long_context
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # ensure to restart self first
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
49 |
50 | echo "running checkpoint cleanup"
51 |
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_347_smolvlm_500M_2nd_stage/merge_lora_and_resize_eou.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou"
29 |
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 |
34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR
36 | echo "Done"
37 |
38 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_347_smolvlm_500M_2nd_stage/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_345-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_345_vsmollm2_256M_2nd_stage/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 |
11 | set -e
12 |
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 | # check the original location through scontrol and $SLURM_JOB_ID
16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 | # otherwise: started with bash. Get the real location.
19 | SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=8
29 |
30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
31 |
32 | EXPERIMENT_NAME=tr_345_vsmollm2_256M_2nd_stage
33 |
34 | ### EDIT ME END ###
35 |
36 |
37 | echo "START TIME: $(date)"
38 |
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 |
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 |
47 | # ensure to restart self first
48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
50 |
51 | echo "running s3 checkpoint upload"
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_347_smolvlm_500M_2nd_stage/schedule-evals.sh:
--------------------------------------------------------------------------------
1 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_1024.slurm
2 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_2048.slurm
3 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_1024.slurm
4 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_2048.slurm
5 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_348_smolvlm_2B/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_348-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_348_smolvlm_2B_token_fix/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=6
28 |
29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
30 |
31 | EXPERIMENT_NAME=tr_348_smolvlm_2B_token_fix
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # ensure to restart self first
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
49 |
50 | echo "running checkpoint cleanup"
51 |
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_348_smolvlm_2B/merge_lora_and_resize_eou.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_348_smolvlm_2B_token_fix/opt_step-22250/"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_348_smolvlm_2B_token_fix_opt_step_22250_merge_and_resize_eou"
29 |
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 |
34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
35 | python experiments/pretraining/vloom/tr_348_smolvlm_2B_token_fix/resize_embed_for_eou.py $OUTPUT_DIR
36 | echo "Done"
37 |
38 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_348_smolvlm_2B/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_348-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_348_smolvlm_2B_token_fix/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 |
11 | set -e
12 |
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 | # check the original location through scontrol and $SLURM_JOB_ID
16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 | # otherwise: started with bash. Get the real location.
19 | SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=8
29 |
30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
31 |
32 | EXPERIMENT_NAME=tr_348_smolvlm_2B_token_fix
33 |
34 | ### EDIT ME END ###
35 |
36 |
37 | echo "START TIME: $(date)"
38 |
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 |
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 |
47 | # ensure to restart self first
48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
50 |
51 | echo "running s3 checkpoint upload"
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_348_smolvlm_2B/schedule-evals.slurm:
--------------------------------------------------------------------------------
1 | sbatch experiments/evaluation/vloom/async_evals_tr_348/run_evals_4_shots_captioning_1024.slurm
2 | sbatch experiments/evaluation/vloom/async_evals_tr_348/run_evals_4_shots_captioning_1920.slurm
3 | sbatch experiments/evaluation/vloom/async_evals_tr_348/run_evals_4_shots_vqa_1024.slurm
4 | sbatch experiments/evaluation/vloom/async_evals_tr_348/run_evals_4_shots_vqa_1920.slurm
5 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_349_vsmollm2_500M_3rd_stage/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_349-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_349_vsmollm2_500M_3rd_stage/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=6
28 |
29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
30 |
31 | EXPERIMENT_NAME=tr_349_vsmollm2_500M_3rd_stage
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # ensure to restart self first
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
49 |
50 | echo "running checkpoint cleanup"
51 |
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_349_vsmollm2_500M_3rd_stage/merge_lora_and_resize_eou.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou"
29 |
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 |
34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR
36 | echo "Done"
37 |
38 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_349_vsmollm2_500M_3rd_stage/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_349-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=hopper-prod
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_349_vsmollm2_500M_3rd_stage/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 |
11 | set -e
12 |
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 | # check the original location through scontrol and $SLURM_JOB_ID
16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 | # otherwise: started with bash. Get the real location.
19 | SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=8
29 |
30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
31 |
32 | EXPERIMENT_NAME=tr_349_vsmollm2_500M_3rd_stage
33 |
34 | ### EDIT ME END ###
35 |
36 |
37 | echo "START TIME: $(date)"
38 |
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 |
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 |
47 | # ensure to restart self first
48 | # echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | # sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
50 |
51 | echo "running s3 checkpoint upload"
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_349_vsmollm2_500M_3rd_stage/schedule-evals.sh:
--------------------------------------------------------------------------------
1 | sbatch experiments/evaluation/vloom/async_evals_tr_349/run_evals_0_shots_val_512.slurm
2 | sbatch experiments/evaluation/vloom/async_evals_tr_349/run_evals_0_shots_val_1024.slurm
3 | sbatch experiments/evaluation/vloom/async_evals_tr_349/run_evals_0_shots_val_1536.slurm
4 | sbatch experiments/evaluation/vloom/async_evals_tr_349/run_evals_0_shots_val_2048.slurm
5 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_350_smolvlm_2B_2nd_stage/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_320-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_320_vsmollm2_long_context/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=6
28 |
29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
30 |
31 | EXPERIMENT_NAME=tr_320_vsmollm2_long_context
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # ensure to restart self first
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
49 |
50 | echo "running checkpoint cleanup"
51 |
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_350_smolvlm_2B_2nd_stage/merge_lora_and_resize_eou.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou"
29 |
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 |
34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR
36 | echo "Done"
37 |
38 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_350_smolvlm_2B_2nd_stage/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_350-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=hopper-cpu
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_350_smolvlm_2B_2nd_stage/logs/crons/%x-%j.out
8 | #SBATCH --qos high
9 |
10 |
11 | set -e
12 |
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 | # check the original location through scontrol and $SLURM_JOB_ID
16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 | # otherwise: started with bash. Get the real location.
19 | SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=8
29 |
30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3
31 |
32 | EXPERIMENT_NAME=tr_350_smolvlm_2B_2nd_stage
33 |
34 | ### EDIT ME END ###
35 |
36 |
37 | echo "START TIME: $(date)"
38 |
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 |
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 |
47 | # reschedule this script first so the cron chain keeps running even if the work below fails
48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
50 |
51 | echo "running s3 checkpoint upload"
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
56 |
57 | echo "END TIME: $(date)"
58 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_350_smolvlm_2B_2nd_stage/schedule-evals.sh:
--------------------------------------------------------------------------------
1 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_1024.slurm
2 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_2048.slurm
3 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_1024.slurm
4 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_2048.slurm
5 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_cron_template/README.md:
--------------------------------------------------------------------------------
1 | # Chronicles
2 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_cron_template/cleanup-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_XYZ-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=1:00:00
6 | #SBATCH --partition=production-cluster
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_cron_template/logs/crons/%x-%j.out
8 |
9 | set -e
10 |
11 | # ----------------- Auto-Workdir -----------------
12 | if [ -n "$SLURM_JOB_ID" ]; then
13 | # check the original location through scontrol and $SLURM_JOB_ID
14 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
15 | else
16 | # otherwise: started with bash. Get the real location.
17 | SCRIPT_PATH=$(realpath $0)
18 | fi
19 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
20 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
21 | # --------------------------------------------------
22 |
23 | ### EDIT ME START ###
24 |
25 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved
26 | RUN_FREQUENCY_IN_HOURS=1
27 |
28 | CONDA_ENV_NAME=shared-m4
29 |
30 | EXPERIMENT_NAME=tr_cron_template
31 |
32 | ### EDIT ME END ###
33 |
34 |
35 | echo "START TIME: $(date)"
36 |
37 | source /fsx/m4/start-m4-user
38 | conda activate base
39 | conda activate $CONDA_ENV_NAME
40 |
41 | pushd $M4_REPO_PATH
42 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
43 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
44 |
45 | # reschedule this script first so the cron chain keeps running even if the work below fails
46 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
47 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm
48 |
49 | echo "running checkpoint cleanup"
50 |
51 |
52 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
53 |
54 | $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH
55 |
56 | echo "END TIME: $(date)"
57 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_cron_template/convert-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_XYZ-convert-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=production-cluster
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_cron_template/logs/crons/%x-%j.out
8 |
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the checkpoint conversion - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=1
28 |
29 | CONDA_ENV_NAME=shared-m4
30 |
31 | EXPERIMENT_NAME=tr_cron_template
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # reschedule this script first so the cron chain keeps running even if the work below fails
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour convert-checkpoints.slurm
49 |
50 | echo "running checkpoint converter"
51 |
52 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
53 |
54 | $M4_REPO_PATH/m4/scripts/convert-checkpoints.py $M4_CHECKPOINTS_PATH
55 |
56 | echo "END TIME: $(date)"
57 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_cron_template/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_XYZ-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=production-cluster
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_cron_template/logs/crons/%x-%j.out
8 |
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 |
24 | ### EDIT ME START ###
25 |
26 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=1
28 |
29 | CONDA_ENV_NAME=shared-m4
30 |
31 | EXPERIMENT_NAME=tr_cron_template
32 |
33 | ### EDIT ME END ###
34 |
35 |
36 | echo "START TIME: $(date)"
37 |
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 |
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 |
46 | # reschedule this script first so the cron chain keeps running even if the work below fails
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
49 |
50 | echo "running checkpoint uploader"
51 |
52 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
53 |
54 | $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
55 |
56 | echo "END TIME: $(date)"
57 |
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_cron_template/s3-upload-run-files.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_XYZ-s3-upload-run-files
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --mem-per-cpu=11G
7 | #SBATCH --partition=production-cluster
8 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_cron_template/logs/crons/%x-%j.out
9 |
10 |
11 | set -e
12 |
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 | # check the original location through scontrol and $SLURM_JOB_ID
16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 | # otherwise: started with bash. Get the real location.
19 | SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=1
29 |
30 | CONDA_ENV_NAME=shared-m4
31 |
32 | EXPERIMENT_NAME=tr_cron_template
33 |
34 | ### EDIT ME END ###
35 |
36 |
37 | echo "START TIME: $(date)"
38 |
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 |
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 |
47 | # reschedule this script first so the cron chain keeps running even if the work below fails
48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-run-files.slurm
50 |
51 | echo "upload run files"
52 |
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 |
55 |
56 | # Upload other files
57 | aws s3 cp $M4_CHECKPOINTS_PATH s3://m4-exps/${EXPERIMENT_NAME} --exclude "*opt*" --exclude "*shared*" --recursive
58 |
59 | echo "END TIME: $(date)"
60 |
--------------------------------------------------------------------------------
/vision/finetuning/README.md:
--------------------------------------------------------------------------------
1 | # Finetuning
2 |
3 | Here you can find a notebook to finetune SmolVLM on Visual Question Answering on a consumer GPU with QLoRA.
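4 |
5 | For orientation, below is a minimal QLoRA sketch (not the notebook itself; the checkpoint id, target modules and hyperparameters are illustrative assumptions and may differ from what the notebook uses):
6 |
7 | ```python
8 | # Sketch only: 4-bit quantized base model + LoRA adapters, sized for a consumer GPU.
9 | import torch
10 | from peft import LoraConfig, get_peft_model
11 | from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
12 |
13 | model_id = "HuggingFaceTB/SmolVLM-Instruct"  # assumed checkpoint
14 |
15 | # Quantize the base model to 4-bit (NF4) so it fits in consumer-GPU memory.
16 | bnb_config = BitsAndBytesConfig(
17 |     load_in_4bit=True,
18 |     bnb_4bit_quant_type="nf4",
19 |     bnb_4bit_compute_dtype=torch.bfloat16,
20 | )
21 | processor = AutoProcessor.from_pretrained(model_id)
22 | model = AutoModelForVision2Seq.from_pretrained(
23 |     model_id,
24 |     quantization_config=bnb_config,
25 |     device_map="auto",
26 | )
27 |
28 | # Train only low-rank adapters on the attention projections (assumed module names).
29 | lora_config = LoraConfig(
30 |     r=8,
31 |     lora_alpha=8,
32 |     lora_dropout=0.1,
33 |     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
34 |     task_type="CAUSAL_LM",
35 | )
36 | model = get_peft_model(model, lora_config)
37 | model.print_trainable_parameters()
38 | ```
39 |
40 | The resulting model can then be trained with a standard `transformers` `Trainer` loop over VQA examples prepared with `processor`.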
--------------------------------------------------------------------------------
/vision/m4/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.utils import logging
2 |
--------------------------------------------------------------------------------
/vision/m4/evaluation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/evaluation/__init__.py
--------------------------------------------------------------------------------
/vision/m4/evaluation/custom_metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.evaluation.custom_metrics.classification_vqa_metrics import ClassificationVQAMetrics
2 | from m4.evaluation.custom_metrics.doc_vqa_metrics import DocVQAMetrics
3 | from m4.evaluation.custom_metrics.image_caption_matching_metrics import ImageCaptionMatchingMetrics
4 | from m4.evaluation.custom_metrics.open_ended_vqa_metrics import OpenEndedVQAMetrics
5 | from m4.evaluation.custom_metrics.perplexity_metrics import PerplexityMetrics
6 | from m4.evaluation.custom_metrics.unfolded_classification_metrics import UnfoldedClassificationMetrics
7 | from m4.evaluation.custom_metrics.unfolded_image_captioning_metrics import UnfoldedImageCaptioningMetrics
8 |
--------------------------------------------------------------------------------
/vision/m4/evaluation/evaluators/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.evaluation.evaluators.in_contexter import in_contexter
2 | from m4.evaluation.evaluators.linear_prober import linear_prober
3 |
--------------------------------------------------------------------------------
/vision/m4/evaluation/generation/README.md:
--------------------------------------------------------------------------------
1 | # Generation Process:
2 |
3 | - find one or more opt-step checkpoints to make generations with
4 | - create a folder in code/m4/experiments/generations
5 | - add a config.yaml and a [gen_folder_name]_generate.slurm file
6 | - fill the config file with the desired hyperparameters: prompt/num_beams/ngram_repeats, etc.
7 | - run sbatch [m4_repo_name]/experiments/generation/[gen_folder_name]/[gen_folder_name]_generate.slurm
8 | - check wandb and make sure your column shows up. If it doesn't, click on "columns" at the bottom right of the generation table and slide the missing generation to the "Displayed columns" side
9 |
--------------------------------------------------------------------------------
/vision/m4/evaluation/generation/deprecated_generation/log_generation.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=make_generation
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --partition prepost
6 | #SBATCH --cpus-per-task=1 # number of cores per tasks
7 | #SBATCH --hint=nomultithread # we get physical cores not logical
8 | #SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/generation_dir/logs/%x_%j.out
10 | #SBATCH --account=cnw@cpu
11 |
12 | source $cnw_ALL_CCFRWORK/start-m4-user
13 |
14 | conda activate $CONDA_ENV_NAME
15 |
16 | export WANDB_DIR=$cnw_ALL_CCFRSCRATCH/experiments
17 |
18 | pushd $WORKING_DIR
19 |
20 | GIT_PYTHON_GIT_EXECUTABLE=`which git`
21 | export GIT_PYTHON_GIT_EXECUTABLE
22 |
23 | python m4/evaluation/generation/log_generation.py --gen_file $GEN_FILE
24 |
--------------------------------------------------------------------------------
/vision/m4/evaluation/generation/deprecated_generation/make_generation.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=make_generation
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --partition gpu_p2
6 | #SBATCH --gres=gpu:1 # number of gpu
7 | #SBATCH --cpus-per-task=2 # number of cores per tasks
8 | #SBATCH --hint=nomultithread # we get physical cores not logical
9 | #SBATCH --time 00:30:00 # maximum execution time (HH:MM:SS)
10 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/generation_dir/logs/%x_%j.out
11 | #SBATCH --account=cnw@gpu
12 |
13 | source $cnw_ALL_CCFRWORK/start-m4-user
14 |
15 | conda activate leo-flash
16 |
17 | # We are on an offline partition
18 | export TRANSFORMERS_OFFLINE=1
19 |
20 | pushd $WORKING_DIR
21 | GIT_PYTHON_GIT_EXECUTABLE=`which git`
22 | export GIT_PYTHON_GIT_EXECUTABLE
23 |
24 | echo "model paths:"
25 | echo $MODEL_PATHS
26 | python m4/evaluation/generation/launch_generation.py --load_config $CONFIG --job_id $SLURM_JOB_ID
27 |
--------------------------------------------------------------------------------
/vision/m4/evaluation/generation/deprecated_generation/master_generate.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=generate
3 | #SBATCH --nodes=1
4 | #SBATCH --qos=qos_cpu-dev
5 | #SBATCH --ntasks-per-node=1
6 | #SBATCH --cpus-per-task=1 # number of cores per tasks
7 | #SBATCH --hint=nomultithread # we get physical cores not logical
8 | #SBATCH --time 00:05:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/generation_dir/logs/%x_%j.out
10 | #SBATCH --account=cnw@cpu
11 |
12 | source $cnw_ALL_CCFRWORK/start-m4-user
13 |
14 | CONDA_ENV_NAME="leo"
15 |
16 | conda activate $CONDA_ENV_NAME
17 |
18 | # We are on an offline partition
19 | WORKING_DIR=$WORK/code/m4_runs_5
20 | pushd $WORKING_DIR
21 |
22 | GIT_PYTHON_GIT_EXECUTABLE=`which git`
23 | export GIT_PYTHON_GIT_EXECUTABLE
24 |
25 | CONFIG="m4/evaluation/generation/generate_config.yaml"
26 | JID_JOB=$(sbatch --job-name=make_generation_$SLURM_JOB_ID --export=ALL,CONDA_ENV_NAME=$CONDA_ENV_NAME,WORKING_DIR=$WORKING_DIR,CONFIG=$CONFIG m4/evaluation/generation/make_generation.slurm)
27 |
28 | JID_JOB=${JID_JOB##* }
29 |
30 | echo $JID_JOB
31 |
32 | GEN_FILE="/gpfsscratch/rech/cnw/commun/experiments/generation_dir/generation_tmp_files_dir/gen_${JID_JOB}.json"
33 |
34 | echo $GEN_FILE
35 |
36 | sbatch --dependency=afterok:$JID_JOB --job-name=log_generation_$SLURM_JOB_ID --export=ALL,CONDA_ENV_NAME=$CONDA_ENV_NAME,WORKING_DIR=$WORKING_DIR,GEN_FILE=$GEN_FILE m4/evaluation/generation/log_generation.slurm
37 |
--------------------------------------------------------------------------------
/vision/m4/evaluation/scripts/README.md:
--------------------------------------------------------------------------------
1 | We need to locally save some datasets with `copy_remote_sample_datasets.py` because the caching function does not work for some datasets, see https://github.com/huggingface/datasets/issues/4760 and https://github.com/huggingface/datasets/issues/3547.
2 |
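3 | A split saved this way can be read back with `datasets.load_from_disk`; a minimal sketch (the path below is a placeholder following the `save_dir / dataset_name / split` layout used in `copy_remote_sample_datasets.py`):
4 |
5 | ```python
6 | # Sketch: reload a split previously written with Dataset.save_to_disk.
7 | from datasets import load_from_disk
8 |
9 | dataset = load_from_disk("/gpfsscratch/rech/cnw/commun/local_datasets/<dataset_name>/<split>")
10 | print(len(dataset))
11 | ```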
--------------------------------------------------------------------------------
/vision/m4/evaluation/scripts/copy_remote_sample_datasets.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from datasets import load_dataset
4 |
5 | from m4.evaluation.tasks import VGPT2_SAMPLE_TASKS, Predictor
6 | from m4.evaluation.utils import EvaluationVersion
7 |
8 |
9 | MIN_DATASET_SIZE = 100
10 | DEFAULT_NUM_EX_PER_CLASS = 3
11 |
12 | ALREADY_COPIED_DATASETS = set()
13 |
14 | model_name = "gpt2" # Not used but necessary to load the task
15 | tokenizer_name = "t5-base" # Not used but necessary to load the task
16 | image_size = 224 # Not used but necessary to load the task
17 | evaluation_version = EvaluationVersion.v2 # Not used but necessary to load the task
18 |
19 | save_dir = Path("/gpfsscratch/rech/cnw/commun/local_datasets")
20 |
21 |
22 | def load_and_save_dataset(task, split, save_dir):
23 | dataset_split = load_dataset(task.dataset_name, name=task.dataset_config, split=split, use_auth_token=True)
24 | print("********************************************************")
25 | print(task.__class__.__name__)
26 | print(len(dataset_split))
27 | print(f"Dataset name is {task.dataset_name} and split is {split} and config is {task.dataset_config}")
28 | print("********************************************************")
29 |
30 | dataset_split.save_to_disk(save_dir / task.dataset_name / split)
31 |
32 |
33 | if __name__ == "__main__":
34 | for task_objet in VGPT2_SAMPLE_TASKS[Predictor.in_contexter]:
35 | task = task_objet(
36 | model_name=model_name,
37 | tokenizer_name=tokenizer_name,
38 | image_size=image_size,
39 | evaluation_version=evaluation_version,
40 | )
41 |
42 | load_and_save_dataset(task, task.test_split_name, save_dir)
43 | load_and_save_dataset(task, task.train_split_name, save_dir)
44 |
--------------------------------------------------------------------------------
/vision/m4/evaluation/tasks/base.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import List, Optional
3 |
4 |
5 | class Predictor(Enum):
6 | in_contexter = "in_contexter"
7 | linear_prober = "linear_prober"
8 |
9 |
10 | class BaseTask:
11 | dataset_name: str # Dataset (example: birdsnap)
12 | dataset_config: Optional[str] = None # Dataset config (example: partition_1)
13 | default_support_split_name: Optional[str] = None
14 | default_query_split_name: str
15 | metric_name: str # the metric to use (example: accuracy) - use evaluate
16 | metrics_kwargs: Optional[dict] = {}
17 | extra_metrics: Optional[list] = None
18 | model_class: str # The model
19 | predictor_class: Predictor
20 | id_column_name: Optional[str] = None
21 |
22 | def __init__(self, **kwargs) -> None:
23 | pass
24 |
25 |
26 | class BaseTaskClassification(BaseTask):
27 | image_column_names: List[str]
28 | label_column_name: str
29 | context_column_names: Optional[List[str]] = None
30 | tested_ex_excluded_context_columns: Optional[List[str]] = None
31 | tested_labels_column_name: Optional[str] = None
32 | relevance_scores_column_name: Optional[str] = None
33 |
34 |
35 | class BaseTaskOpenEndedVQA(BaseTask):
36 | image_column_name: str
37 | question_column_name: str
38 | answers_column_name: str
39 | context_column_names: Optional[List[str]] = None
40 |
41 |
42 | class BaseTaskImageCaptioning(BaseTask):
43 | image_column_name: str
44 | reference_captions_column_name: str
45 | context_column_name: Optional[str] = None
46 |
47 |
48 | class BaseTaskImageCaptionMatching(BaseTask):
49 | image_column_names: List[str]
50 | caption_column_names: List[str]
51 |
--------------------------------------------------------------------------------
/vision/m4/models/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.models.custom_modules import DecoupledEmbedding, DecoupledLinear
2 | from m4.models.idefics.configuration_idefics import IdeficsConfig
3 | from m4.models.idefics.modeling_idefics import IdeficsForCausalLM
4 | from m4.models.vgpt2.configuration_vgpt2 import VGPT2Config
5 | from m4.models.vgpt2.modeling_vgpt2 import VGPT2LMHeadModel
6 | from m4.models.vllama3.configuration_vllama3 import VLlama3Config
7 | from m4.models.vllama3.modeling_vllama3 import VLlama3ForCausalLM
8 | from m4.models.vmistral.configuration_vmistral import VMistralConfig
9 | from m4.models.vmistral.modeling_vmistral import VMistralForCausalLM
10 |
11 |
12 | _SUPPORTED_MODELS = {
13 | "vgpt2": VGPT2Config,
14 | # "vllama": IdeficsConfig,
15 | "idefics": IdeficsConfig,
16 | "vmistral": VMistralConfig,
17 | "vllama3": VLlama3Config,
18 | }
19 |
20 | model_type_to_modeling_class = {
21 | "vgpt2": VGPT2LMHeadModel,
22 | # "vllama": IdeficsForCausalLM,
23 | "idefics": IdeficsForCausalLM,
24 | "vmistral": VMistralForCausalLM,
25 | "vllama3": VLlama3ForCausalLM,
26 | }
27 |
--------------------------------------------------------------------------------
/vision/m4/models/idefics/evaluation_classification_vqa_in_context_idefics.py:
--------------------------------------------------------------------------------
1 | from m4.evaluation.custom_metrics.classification_vqa_metrics import ClassifVQAMetrics
2 | from m4.models.vgpt2.evaluation_classification_vqa_in_context_vgpt2 import Vgpt2ClassificationVQAInContext
3 |
4 |
5 | class IdeficsClassificationVQAInContext(Vgpt2ClassificationVQAInContext):
6 | model_class: str = "IdeficsForCausalLM"
7 | tokenizer_max_seq_len = 2048
8 |
9 |
10 | class VQAv2IdeficsClassificationVQAInContextAcc(IdeficsClassificationVQAInContext):
11 | dataset_name: str = "HuggingFaceM4/VQAv2_modif"
12 | metric_name: str = "ClassificationVQAMetrics"
13 | metric_kwargs = {
14 | "metrics": [
15 | ClassifVQAMetrics.VQA_ACCURACY,
16 | ClassifVQAMetrics.ENTROPY_DISTRIBUTION,
17 | ClassifVQAMetrics.ENTROPY_MEAN,
18 | ]
19 | }
20 | default_query_split_name: str = "validation"
21 | default_support_split_name: str = "train"
22 | image_column_name: str = "image"
23 | question_column_name: str = "question"
24 | answers_column_name: str = "answers"
25 | length_normalize: bool = False
26 |
27 |
28 | class VQAv2SampleIdeficsClassificationVQAInContextAcc(VQAv2IdeficsClassificationVQAInContextAcc):
29 | dataset_name: str = "HuggingFaceM4/VQAv2_modif-Sample"
30 |
--------------------------------------------------------------------------------
/vision/m4/models/idefics/make_tiny_llama.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # This script creates a super tiny model that is useful inside tests, when we just want to test that
4 | # the machinery works, without needing to check the quality of the outcomes.
5 | #
6 | # usage: adjust the configs if wanted, but otherwise just run the script
7 |
8 | from pathlib import Path
9 |
10 | from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
11 |
12 |
13 | mname_tiny = "tiny-random-LlamaForCausalLM"
14 |
15 | path = Path(mname_tiny)
16 | path.mkdir(parents=True, exist_ok=True)
17 |
18 | config = LlamaConfig()
19 | config.update(
20 | dict(
21 | vocab_size=32000,
22 | hidden_size=16,
23 | intermediate_size=16 * 4,
24 | num_hidden_layers=2,
25 | num_attention_heads=4,
26 | )
27 | )
28 | model = LlamaForCausalLM(config)
29 | tokenizer = LlamaTokenizer.from_pretrained("path_to_llama_7b")
30 |
31 | # Test w/ one text
32 | query = "This is a test"
33 | query_tokens = tokenizer(query, return_tensors="pt")
34 |
35 | input = {
36 | "input_ids": query_tokens["input_ids"],
37 | "attention_mask": query_tokens["attention_mask"],
38 | }
39 |
40 | out_gen = model.generate(**input)
41 | text = tokenizer.batch_decode(out_gen)
42 |
43 | # Save model + config + tokenizer
44 | model.half() # makes it smaller
45 | model.save_pretrained(path)
46 | tokenizer.save_pretrained(path)
47 |
48 | # test we can load it back
49 | model = LlamaForCausalLM.from_pretrained(path)
50 |
51 | print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
52 |
--------------------------------------------------------------------------------
/vision/m4/models/vgpt2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/models/vgpt2/__init__.py
--------------------------------------------------------------------------------
/vision/m4/models/vllama3/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/models/vllama3/__init__.py
--------------------------------------------------------------------------------
/vision/m4/models/vllama3/make_tiny_llama3.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # This script creates a super tiny model that is useful inside tests, when we just want to test that
4 | # the machinery works, without needing to check the quality of the outcomes.
5 | #
6 | # usage: adjust the configs if wanted, but otherwise just run the script
7 |
8 | from pathlib import Path
9 |
10 | from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
11 |
12 |
13 | mname_tiny = "tiny-random-Llama3ForCausalLM"
14 |
15 | path = Path(mname_tiny)
16 | path.mkdir(parents=True, exist_ok=True)
17 |
18 | config = LlamaConfig()
19 | config.update(
20 | dict(
21 | vocab_size=128_256,
22 | hidden_size=16,
23 | intermediate_size=16 * 4,
24 | num_hidden_layers=2,
25 | num_attention_heads=4,
26 | num_key_value_heads=1,
27 | )
28 | )
29 | model = LlamaForCausalLM(config)
30 | tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
31 |
32 | # Test w/ one text
33 | query = "This is a test"
34 | query_tokens = tokenizer(query, return_tensors="pt")
35 |
36 | input = {
37 | "input_ids": query_tokens["input_ids"],
38 | "attention_mask": query_tokens["attention_mask"],
39 | }
40 |
41 | out_gen = model.generate(**input)
42 | text = tokenizer.batch_decode(out_gen)
43 |
44 | # Save model + config + tokenizer
45 | model.half() # makes it smaller
46 | model.save_pretrained(path)
47 | tokenizer.save_pretrained(path)
48 |
49 | # test we can load it back
50 | model = LlamaForCausalLM.from_pretrained(path)
51 |
52 | print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
53 | # Pushed to HuggingFaceM4/tiny-random-Llama3ForCausalLM
54 |
--------------------------------------------------------------------------------
/vision/m4/models/vmistral/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/models/vmistral/__init__.py
--------------------------------------------------------------------------------
/vision/m4/models/vmistral/evaluation_classification_vqa_in_context_vmistral.py:
--------------------------------------------------------------------------------
1 | from m4.evaluation.custom_metrics.classification_vqa_metrics import ClassifVQAMetrics
2 | from m4.models.vgpt2.evaluation_classification_vqa_in_context_vgpt2 import Vgpt2ClassificationVQAInContext
3 |
4 |
5 | class VMistralClassificationVQAInContext(Vgpt2ClassificationVQAInContext):
6 | model_class: str = "VMistralForCausalLM"
7 | tokenizer_max_seq_len = 4096
8 |
9 |
10 | class VQAv2VMistralClassificationVQAInContextAcc(VMistralClassificationVQAInContext):
11 | dataset_name: str = "HuggingFaceM4/VQAv2_modif"
12 | metric_name: str = "ClassificationVQAMetrics"
13 | metric_kwargs = {
14 | "metrics": [
15 | ClassifVQAMetrics.VQA_ACCURACY,
16 | ClassifVQAMetrics.ENTROPY_DISTRIBUTION,
17 | ClassifVQAMetrics.ENTROPY_MEAN,
18 | ]
19 | }
20 | default_query_split_name: str = "validation"
21 | default_support_split_name: str = "train"
22 | image_column_name: str = "image"
23 | question_column_name: str = "question"
24 | answers_column_name: str = "answers"
25 | length_normalize: bool = False
26 |
27 |
28 | class VQAv2SampleVMistralClassificationVQAInContextAcc(VQAv2VMistralClassificationVQAInContextAcc):
29 | dataset_name: str = "HuggingFaceM4/VQAv2_modif-Sample"
30 |
--------------------------------------------------------------------------------
/vision/m4/models/vmistral/make_tiny_mistral.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # This script creates a super tiny model that is useful inside tests, when we just want to test that
4 | # the machinery works, without needing to check the quality of the outcomes.
5 | #
6 | # usage: adjust the configs if wanted, but otherwise just run the script
7 |
8 | from pathlib import Path
9 |
10 | from transformers import AutoTokenizer, MistralConfig, MistralForCausalLM
11 |
12 |
13 | mname_tiny = "tiny-random-MistralForCausalLM"
14 |
15 | path = Path(mname_tiny)
16 | path.mkdir(parents=True, exist_ok=True)
17 |
18 | config = MistralConfig()
19 | config.update(
20 | dict(
21 | vocab_size=32000,
22 | hidden_size=16,
23 | intermediate_size=16 * 4,
24 | num_hidden_layers=2,
25 | num_attention_heads=4,
26 | num_key_value_heads=1,
27 | )
28 | )
29 | model = MistralForCausalLM(config)
30 | tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
31 |
32 | # Test w/ one text
33 | query = "This is a test"
34 | query_tokens = tokenizer(query, return_tensors="pt")
35 |
36 | input = {
37 | "input_ids": query_tokens["input_ids"],
38 | "attention_mask": query_tokens["attention_mask"],
39 | }
40 |
41 | out_gen = model.generate(**input)
42 | text = tokenizer.batch_decode(out_gen)
43 |
44 | # Save model + config + tokenizer
45 | model.half() # makes it smaller
46 | model.save_pretrained(path)
47 | tokenizer.save_pretrained(path)
48 |
49 | # test we can load it back
50 | model = MistralForCausalLM.from_pretrained(path)
51 |
52 | print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
53 | # Pushed to HuggingFaceM4/tiny-random-MistralForCausalLM
54 |
--------------------------------------------------------------------------------
/vision/m4/scripts/clean_jsonl_evals.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | PATH_JSONL = "/Users/hugolaurencon/Desktop/tr_209_ift_mixture_test_final_evaluations.jsonl"
5 |
6 |
7 | BANNED_KEYS = [
8 | "kl_distribution",
9 | "entropy_distribution",
10 | "kl_mean",
11 | "Bleu_1",
12 | "Bleu_1_all",
13 | "Bleu_2",
14 | "Bleu_2_all",
15 | "Bleu_3",
16 | "Bleu_3_all",
17 | "Bleu_4",
18 | "Bleu_4_all",
19 | "METEOR",
20 | "METEOR_all",
21 | "CIDEr_all",
22 | "ROUGE_L",
23 | "ROUGE_L_all",
24 | "per_bucket_accuracy",
25 | "std_per_bucket_accuracy",
26 | "entropy_mean",
27 | ]
28 |
29 |
30 | jsonl_data = []
31 | with open(PATH_JSONL, "r") as file:
32 | for line in file:
33 | json_data = json.loads(line)
34 | jsonl_data.append(json_data)
35 |
36 |
37 | for idx, data in enumerate(jsonl_data):
38 | if "score" in data:
39 | if type(data["score"]) == str:
40 | data["score"] = json.loads(data["score"].replace("'", '"'))
41 | for banned_key in BANNED_KEYS:
42 | if banned_key in data["score"]:
43 | data["score"].pop(banned_key)
44 | jsonl_data[idx] = data
45 |
46 |
47 | with open(PATH_JSONL, "w") as file:
48 | for item in jsonl_data:
49 | item_json = json.dumps(item)
50 | file.write(item_json + "\n")
51 |
--------------------------------------------------------------------------------
/vision/m4/scripts/convert_vmistral_lm_head.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import json
3 |
4 | from safetensors import safe_open
5 | from safetensors.torch import save_file
6 |
7 |
8 | model_path = "/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_272_bis_opt_step_15000_merge_and_resize_eou_renamed_lmhead/unwrapped_model"
9 | safetensor_files = glob.glob(f"{model_path}/model*.safetensors")
10 |
11 | KEYS_TO_MODIFY_MAPPING = {
12 | "lm_head.additional_fc": "additional_fc",
13 | }
14 |
15 | for file in safetensor_files:
16 | tensors = {}
17 | with safe_open(file, framework="pt", device="cpu") as f:
18 | for old_key in f.keys():
19 | final_key = old_key
20 | for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
21 | if key_to_modify in old_key:
22 | final_key = old_key.replace(key_to_modify, new_key)
23 | tensors[final_key] = f.get_tensor(old_key)
24 | print(f"{tensors.keys()}")
25 | save_file(tensors, file, metadata={"format": "pt"})
26 |
27 | with open(f"{model_path}/model.safetensors.index.json", "r") as f:
28 | data = json.load(f)
29 | keys_to_iterate = list(data["weight_map"].keys())
30 | new_data_weight_map = {}
31 | for old_key, v in data["weight_map"].items():
32 | final_key = old_key
33 | for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
34 | if key_to_modify in old_key:
35 | final_key = old_key.replace(key_to_modify, new_key)
36 | new_data_weight_map[final_key] = v
37 | data["weight_map"] = new_data_weight_map
38 |
39 | with open(f"{model_path}/model.safetensors.index.json", "w") as f:
40 | json_object = json.dumps(data, indent=4)
41 | f.write(json_object)
42 |
--------------------------------------------------------------------------------
/vision/m4/scripts/job_update_siglip_model_pos_embeds.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=update_siglip_model_pos_embeds.py
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --cpus-per-task=96
6 | #SBATCH --mem-per-cpu=20G
7 | #SBATCH --output=/fsx/m4/experiments/general_logs/update_siglip_model_pos_embeds/%x_%j.out
8 | #SBATCH --time=20:00:00
9 | #SBATCH --partition=hopper-prod
10 | #SBATCH --gpus=1
11 | #SBATCH --qos high
12 |
13 |
14 | set -x -e
15 |
16 | source /fsx/m4/start-m4-user
17 | conda activate base
18 | conda activate shared-m4
19 |
20 |
21 | WORKING_DIR=/fsx/hugo/repos/m4_36
22 | pushd $WORKING_DIR
23 |
24 | python m4/scripts/update_siglip_model_pos_embeds.py
25 |
--------------------------------------------------------------------------------
/vision/m4/scripts/merge_lora_and_behead.sh:
--------------------------------------------------------------------------------
1 | set -x -e
2 |
3 | source /fsx/m4/start-m4-user
4 | conda activate victor
5 |
6 | INPUT_DIR=/fsx/m4/experiments/local_experiment_dir/tr_289_288_ter_12600_lima_sft/opt_step-1400
7 | OUTPUT_DIR=/fsx/m4/victor/idefics2
8 |
9 | SCRIPT_RELATIVE_PATH="${BASH_SOURCE[0]}"
10 | PATH_TO_THIS_FILE=$(realpath "$SCRIPT_RELATIVE_PATH")
11 | echo "The absolute path of the current script file is: $PATH_TO_THIS_FILE"
12 |
13 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
14 | WORKING_DIR=$(builtin cd $SCRIPT_DIR/; pwd)
15 | echo "Working dir is: $WORKING_DIR"
16 |
17 | cd $WORKING_DIR
18 |
19 |
20 | python merge_lora_and_save.py $INPUT_DIR $OUTPUT_DIR
21 | echo "Finished merge lora"
22 | mv $OUTPUT_DIR/unwrapped_model/model* $OUTPUT_DIR
23 | rm -rf $OUTPUT_DIR/unwrapped_model
24 | rm -rf $OUTPUT_DIR/tokenizer # Just a sanity
25 |
26 |
27 | python behead_unused_params.py \
28 | --model_dir $OUTPUT_DIR \
29 | --behead_siglip_pooling \
30 | --behead_perceiver_rmsnorm
31 | echo "Finished behead unused parameters"
32 |
33 | # Push `/fsx/m4/victor/idefics2` to `HuggingFaceM4/idefics2`
34 | # Then call optionally to transform into transformers compatible checkpoint and push to `HuggingFaceM4/idefics2-tfrm-compatible`
35 | # python transformers/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py \
36 | # --original_model_id HuggingFaceM4/idefics2 \
37 | # --output_hub_path /fsx/m4/victor/idefics2-tfrm-compatible
38 |
--------------------------------------------------------------------------------
/vision/m4/scripts/merge_lora_and_resize_eou_template.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_315_vsmollm_long_context/opt_step-12810/"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_315_vsmollm_long_contex_opt_step_12810_merge_and_resize_eou"
29 |
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 | pushd $M4_REPO_PATH
34 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
35 |
36 | python $M4_REPO_PATH/m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
37 | python $M4_REPO_PATH/m4/scripts/resize_embed_for_eou.py $OUTPUT_DIR
38 | echo "Done"
39 |
--------------------------------------------------------------------------------
/vision/m4/scripts/merge_lora_template.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:0
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras/%x-%j.out
10 |
11 |
12 | set -e
13 |
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 | # check the original location through scontrol and $SLURM_JOB_ID
17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 | # otherwise: started with bash. Get the real location.
20 | SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 |
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/template_dir"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/template_out_dir"
29 |
30 |
31 | source /fsx/m4/start-m4-user
32 | conda activate base
33 | conda activate $CONDA_ENV_NAME
34 | pushd $M4_REPO_PATH
35 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
36 |
37 | python $M4_REPO_PATH/m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
38 |
--------------------------------------------------------------------------------
/vision/m4/scripts/s3_checkpoint_download_convert_upload.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_test-s3-download-and-convert-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=production-cluster
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/logs/%x-%j.out
8 |
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
22 |
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | CONDA_ENV_NAME=shared-m4
28 |
29 | EXPERIMENT_NAME=tr_194_laion_cm4_mix
30 |
31 | opt_step_num_list=(
32 | "1000"
33 | "2000"
34 | )
35 |
36 | ### EDIT ME END ###
37 |
38 |
39 | echo "START TIME: $(date)"
40 |
41 | source /fsx/m4/start-m4-user
42 | conda activate base
43 | conda activate $CONDA_ENV_NAME
44 | pushd $M4_REPO_PATH
45 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
46 |
47 | echo "running checkpoint download, convert, upload for opt-steps: ${opt_step_num_list[@]} of experiment: $EXPERIMENT_NAME"
48 |
49 | python $M4_REPO_PATH/m4/scripts/s3_checkpoint_download_convert_upload.py $EXPERIMENT_NAME ${opt_step_num_list[@]} $M4_REPO_PATH
50 |
51 | echo "END TIME: $(date)"
52 |
--------------------------------------------------------------------------------
/vision/m4/scripts/s3_downloaded_checkpoints_cleanup.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_test-s3-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=production-cluster
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/logs/%x-%j.out
8 |
9 |
10 | set -e
11 |
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 | # check the original location through scontrol and $SLURM_JOB_ID
15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 | # otherwise: started with bash. Get the real location.
18 | SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
22 |
23 | # --------------------------------------------------
24 |
25 | ### EDIT ME START ###
26 |
27 | CONDA_ENV_NAME=shared-m4
28 |
29 | EXPERIMENT_NAME=tr_194_laion_cm4_mix
30 |
31 | opt_step_num_list=(
32 | "1000"
33 | "2000"
34 | )
35 |
36 | ### EDIT ME END ###
37 |
38 |
39 | echo "START TIME: $(date)"
40 |
41 | source /fsx/m4/start-m4-user
42 | conda activate base
43 | conda activate $CONDA_ENV_NAME
44 | pushd $M4_REPO_PATH
45 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
46 |
47 | for opt_step_num in ${opt_step_num_list[@]}
48 | do
49 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/${EXPERIMENT_NAME}/opt_step-${opt_step_num}"
50 | rm -r $OPT_STEP_DIR
51 | echo "Deleted $OPT_STEP_DIR of experiment: $EXPERIMENT_NAME"
52 | done
53 |
54 | echo "END TIME: $(date)"
55 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/callers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/callers/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/configs/config_extract_web_documents.yaml:
--------------------------------------------------------------------------------
1 | dom_tree_simplificator:
2 | strip_multiple_linebreaks: True
3 | strip_multiple_spaces: True
4 | remove_html_comments: True
5 | replace_line_break_tags: True
6 | unwrap_tags: True
7 | strip_tags: True
8 | strip_special_divs: True
9 | remove_dates: True
10 | remove_empty_leaves: True
11 | unnest_nodes: True
12 | remake_tree: True
13 | css_rules:
14 | - "[class~='footer']"
15 | - "[class~='site-info']"
16 | css_rules_replace_with_text: {"[class~='more-link']": "\n\nEND_OF_DOCUMENT_TOKEN_TO_BE_REPLACED\n\n"}
17 | pre_extraction_simplificator:
18 | only_text_image_nodes: True
19 | format_texts: True
20 | merge_consecutive_text_nodes: True
21 | web_document_extractor:
22 | image_size: 256
23 | resize_mode: no
24 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/configs/config_filter_laion_pairs.yaml:
--------------------------------------------------------------------------------
1 | cond_check_size_image: True
2 | original_width_min_cutoff: 150
3 | original_width_max_cutoff: 10_000
4 | original_height_min_cutoff: 150
5 | original_height_max_cutoff: 10_000
6 | aspect_ratio_max_cutoff: 3
7 | cond_check_number_words: True
8 | number_words_min_cutoff: 1
9 | number_words_max_cutoff: 256
10 | cond_check_word_repetition_ratio: True
11 | word_repetition_length: 1
12 | word_repetition_max_cutoff: 0.7
13 | cond_check_special_character_ratio: True
14 | special_character_ratio_max_cutoff: 0.4
15 | cond_check_common_word_ratio: True
16 | common_word_ratio_min_cutoff: 0.7
17 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/configs/config_filter_text_image_pairs.yaml:
--------------------------------------------------------------------------------
1 | cond_check_image_in_simplified_dom_tree: True
2 | cond_check_format: True
3 | valid_formats: !!set {jpg, jpeg, png, webp}
4 | cond_check_size_image: True
5 | original_width_min_cutoff: 100
6 | # PIL decompression bomb warning is at 178M pixels, and 10000^2=100M is close
7 | original_width_max_cutoff: 10000
8 | original_height_min_cutoff: 100
9 | original_height_max_cutoff: 10000
10 | rendered_width_min_cutoff: 100
11 | rendered_width_max_cutoff: 100000
12 | rendered_height_min_cutoff: 100
13 | rendered_height_max_cutoff: 100000
14 | aspect_ratio_max_cutoff: 3
15 | cond_remove_non_printing_characters: False # Warning: if set to True, note that the non-printing character set contains " " and "\n"
16 | cond_standardize_whitespace: True
17 | cond_check_number_words: True
18 | number_words_min_cutoff: 3
19 | number_words_max_cutoff: 256
20 | cond_check_special_character_ratio: True
21 | special_character_ratio_max_cutoff: 0.4
22 | cond_check_stopword_ratio: False
23 | stopword_ratio_min_cutoff: 0
24 | cond_check_repetition_ratio: True
25 | repetition_ratio_max_cutoff: 0.3
26 | cond_check_clip_score: True
27 | clip_score_min_cutoff: 0.2
28 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/debug/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/debug/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/README.md:
--------------------------------------------------------------------------------
1 | # Clip distributions - descriptive stat
2 |
3 | - SBU Captions
4 | ```python
5 | DescribeResult(nobs=10000, minmax=(0.11153904348611832, 0.44991129636764526), mean=0.2874957061290741, variance=0.0016425453395696478, skewness=-0.22512623318313724, kurtosis=0.1512977180455395)
6 | ```
7 |
8 | - Red Caps
9 | ```python
10 | DescribeResult(nobs=10000, minmax=(0.08980361372232437, 0.4210364818572998), mean=0.3082767878524959, variance=0.001230211924011678, skewness=-0.5157219676083339, kurtosis=0.6965278169334876)
11 | ```
12 |
13 | - LAION 400M
14 | ```python
15 | DescribeResult(nobs=10000, minmax=(0.16056129336357117, 0.4760231077671051), mean=0.333618477447331, variance=0.0008586748609226699, skewness=0.7131919650316029, kurtosis=1.668628208211425)
16 | ```
17 |
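18 | These summaries have the `scipy.stats.describe` format; a minimal sketch for recomputing one of them from the `clip_scores_*_10000.npy` arrays stored alongside this README (assuming each array holds the 10,000 raw per-pair scores):
19 |
20 | ```python
21 | # Summarize a saved CLIP-score array (same DescribeResult fields as above).
22 | import numpy as np
23 | from scipy.stats import describe
24 |
25 | scores = np.load("clip_scores_laion400m_10000.npy")
26 | print(describe(scores))  # nobs, minmax, mean, variance, skewness, kurtosis
27 | ```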
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/clip_scores_laion400m_10000.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/clip_scores_laion400m_10000.npy
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/clip_scores_red_caps_10000.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/clip_scores_red_caps_10000.npy
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/clip_scores_sbu_captions_10000.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/clip_scores_sbu_captions_10000.npy
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/distributions_extracted.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/distributions_extracted.png
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/distributions_reference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/distributions_reference.png
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/processors/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.sourcing.data_collection.processors.dom_tree_simplificator import DOMTreeSimplificator
2 | from m4.sourcing.data_collection.processors.html_extractor import HtmlExtractor
3 | from m4.sourcing.data_collection.processors.image_deduplicator import ImageDeduplicator
4 | from m4.sourcing.data_collection.processors.pair_extractor import TextMediaPairsExtractor
5 | from m4.sourcing.data_collection.processors.pair_filtering import PairFiltering
6 | from m4.sourcing.data_collection.processors.pre_extraction_simplificator import PreExtractionSimplificator
7 | from m4.sourcing.data_collection.processors.warc_downloader import WarcDownloader
8 | from m4.sourcing.data_collection.processors.web_document_extractor import CommonCrawlWebDocumentExtractor
9 | from m4.sourcing.data_collection.processors.web_document_filtering import (
10 | FilteringFunctions,
11 | WebDocumentFilteringDocLevel,
12 | WebDocumentFilteringNodeLevel,
13 | )
14 | from m4.sourcing.data_collection.processors.web_document_image_deduplication import WebDocumentImageDeduplication
15 | from m4.sourcing.data_collection.processors.web_document_line_deduplication import WebDocumentLineDeduplication
16 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/processors/warc_downloader.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | from botocore.config import Config
3 | from botocore.exceptions import ClientError, ProxyConnectionError
4 |
5 |
6 | class WarcDownloader:
7 | def __init__(self):
8 | config_boto = Config(
9 | # region_name="us-east-1", # Location of the CC data, commenting this line since it doesn't help
10 | retries={"max_attempts": 10, "mode": "standard"}
11 | )
12 | self.client = boto3.client("s3", config=config_boto)
13 |
14 | def __call__(self, example):
15 | if example["warc"] and not example["warc_error"]:
16 | return example
17 |
18 | warc_filename = example["warc_filename"]
19 | warc_record_offset = example["warc_record_offset"]
20 | warc_record_length = example["warc_record_length"]
21 |
22 | warc, warc_error = self.get_warc_from_metadata(
23 | client=self.client,
24 | warc_filename=warc_filename,
25 | warc_record_offset=warc_record_offset,
26 | warc_record_length=warc_record_length,
27 | )
28 | example["warc"] = warc
29 | example["warc_error"] = warc_error
30 | return example
31 |
32 | def get_warc_from_metadata(self, client, warc_filename, warc_record_offset, warc_record_length):
33 | try:
34 | response = client.get_object(
35 | Bucket="commoncrawl",
36 | Key=warc_filename,
37 | Range=f"bytes={warc_record_offset}-{warc_record_offset + warc_record_length - 1}",
38 | )
39 | except (ClientError, ProxyConnectionError) as e:
40 | return b"", repr(e)
41 | return response["Body"].read(), ""
42 |
43 | # Needed to make multiprocessing work
44 | def __reduce__(self):
45 | return (self.__class__, ())
46 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.sourcing.data_collection.utils.clip_utils import compute_clip_score
2 | from m4.sourcing.data_collection.utils.fetching_utils import fetch_single_image
3 | from m4.sourcing.data_collection.utils.filtering_utils import (
4 | DIGITS_RE,
5 | FLAGGED_WORDS,
6 | NON_PRINTING_CHARACTERS_RE,
7 | PUNCTUATION,
8 | SPECIAL_CHARACTERS,
9 | STOPWORDS,
10 | UNICODE_PUNCTUATION,
11 | )
12 | from m4.sourcing.data_collection.utils.kl_utils import NB_BINS, kl_div
13 | from m4.sourcing.data_collection.utils.simplification_utils import (
14 | TAG_TO_SEP,
15 | format_filename,
16 | format_image_size,
17 | format_relative_to_absolute_path,
18 | get_media_src,
19 | is_url_valid,
20 | simplify_media_node,
21 | )
22 | from m4.sourcing.data_collection.utils.tags_attributes import (
23 | INTERESTING_TAGS_SET,
24 | MEDIA_CONTAIN_INTERESTING_ATTRIBUTES_SET,
25 | UNWRAP_TAGS,
26 | InterestingAttributesSetCategory,
27 | )
28 | from m4.sourcing.data_collection.utils.utils import load_dataset_html, make_selectolax_tree
29 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/utils/kl_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | NB_BINS = 40
5 |
6 |
7 | def kl_div(p, q, nb_bins=NB_BINS):
8 | freq_p, _ = np.histogram(p, bins=nb_bins, range=(0.0, 1.0), density=True)
9 | freq_q, _ = np.histogram(q, bins=nb_bins, range=(0.0, 1.0), density=True)
10 | elem = freq_p * np.log(freq_p / freq_q)
11 | return np.sum(np.where((~np.isnan(elem)) & (freq_q != 0), elem, 0))
12 |
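
A minimal sketch of how `kl_div` can be used to compare two score distributions over [0, 1]; the beta-distributed samples below are purely illustrative stand-ins for, e.g., CLIP scores.

```python
import numpy as np

from m4.sourcing.data_collection.utils.kl_utils import NB_BINS, kl_div

rng = np.random.default_rng(0)
reference_scores = rng.beta(2, 5, size=10_000)  # stand-in for the reference distribution
candidate_scores = rng.beta(2, 8, size=10_000)  # stand-in for the distribution being compared

print(f"KL divergence over {NB_BINS} bins: {kl_div(reference_scores, candidate_scores):.3f}")
```
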
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/utils/utils.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from selectolax.parser import HTMLParser
3 |
4 |
5 | def load_dataset_html(shuffle=False, buffer_size=10000, seed=42):
6 | dataset = load_dataset(
7 | "bs-modeling-metadata/c4-en-html-with-metadata",
8 | streaming=True,
9 | split="train",
10 | use_auth_token=True,
11 | )
12 | if shuffle:
13 | dataset = dataset.shuffle(buffer_size=buffer_size, seed=seed)
14 | dataset = iter(dataset)
15 | return dataset
16 |
17 |
18 | def make_selectolax_tree(html_str):
19 | selectolax_tree = HTMLParser(html_str)
20 | return selectolax_tree
21 |
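
A minimal sketch chaining the two helpers, assuming access to the `bs-modeling-metadata/c4-en-html-with-metadata` dataset and that its examples expose an `html` column.

```python
from m4.sourcing.data_collection.utils import load_dataset_html, make_selectolax_tree

dataset_iter = load_dataset_html(shuffle=True, buffer_size=1_000, seed=0)
example = next(dataset_iter)

tree = make_selectolax_tree(example["html"])  # column name assumed
title_node = tree.css_first("title")
print(title_node.text() if title_node is not None else "no <title> tag found")
```
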
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/visualization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/visualization/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/visualization/choose_filtering_parameters_laion_pairs.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import streamlit as st
3 |
4 |
5 | if __name__ == "__main__":
6 | st.set_page_config(layout="wide")
7 |     st.title("Visualization to help choose the filtering parameters for image / text pair datasets")
8 |
9 | path_stats = "./large_files/stats_vis_choose_filtering_params.pkl"
10 | df_stats = pd.read_pickle(path_stats)
11 |
12 | num_considered_examples = st.number_input(
13 | "Choose the number of image / text pairs to consider",
14 | min_value=0,
15 | max_value=len(df_stats),
16 | value=1_000,
17 | help=f"Enter a number between 0 and {len(df_stats)}",
18 | )
19 | df_stats = df_stats.head(num_considered_examples)
20 |
21 | order_sort = st.selectbox("Sorting in", options=["ascending order", "descending order"], index=0)
22 | stat_sort_on = st.selectbox(
23 | "Sorting on",
24 | options=[name for name in list(df_stats.columns.values) if name not in ["img", "caption"]],
25 | index=0,
26 | )
27 |     ascending_sort = "ascending" in order_sort
28 | df_stats = df_stats.sort_values(stat_sort_on, ascending=ascending_sort)
29 |
30 | html_data_frame = df_stats.to_html(escape=False)
31 | st.markdown(html_data_frame, unsafe_allow_html=True)
32 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/visualization/wikipedia/explore_wiki_results.py:
--------------------------------------------------------------------------------
1 | import random
2 | from pathlib import Path
3 |
4 | import streamlit as st
5 | from datasets import load_from_disk
6 |
7 |
8 | st.set_page_config(layout="wide")
9 |
10 | processed_data_dir = Path("/home/lucile/local_datasets/enwiki/enwiki-NS0-20230220-ENTERPRISE-HTML-EXTRACTION")
11 | original_data_dir = Path("/home/lucile/local_datasets/enwiki/enwiki-NS0-20230220-ENTERPRISE-HTML")
12 | shard_id = 30
13 | exclude_shards = [34]
14 |
15 | processed_ds_name_2 = (
16 | processed_data_dir / f"shard_{shard_id}" / "wikipedia_html_enterprise-with-images-and-html-full-v1-v2"
17 | )
18 | shard_ds = load_from_disk(processed_ds_name_2)
19 |
20 | shard_ds = shard_ds.filter(lambda x: x["html"] is not None)
21 | num_docs = len(shard_ds)
22 |
23 | st.header("Document")
24 | if st.button("Select a random document"):
25 | dct_idx = random.randint(a=0, b=num_docs - 1)
26 | else:
27 | dct_idx = 0
28 | idx = st.number_input(
29 | f"Select a document among the first {num_docs} ones",
30 | min_value=0,
31 | max_value=num_docs - 1,
32 | value=dct_idx,
33 | step=1,
34 | help=f"Index between 0 and {num_docs-1}",
35 | )
36 | current_example = shard_ds[idx]
37 | current_html = current_example["html"]
38 |
39 |
40 | col1, col2 = st.columns(2)
41 | with col1:
42 | st.subheader("Raw html rendering")
43 | st.components.v1.html(current_html, height=700, scrolling=True)
44 | with col2:
45 | st.subheader("Texts and images extracted from the html")
46 | for text, img in zip(current_example["texts"], current_example["images"]):
47 | if img is not None:
48 | st.image(img, caption=text)
49 | else:
50 | st.write(text)
51 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/get_modelling_metadata_dataset/get_modelling_metadata_dataset.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=m4_get_dataset # (change me!) job name
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # (change me! between 0 and 48) number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --time 012:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
8 | #SBATCH --output=/gpfsdswork/projects/rech/cnw/uue59kq/logs/get_dataset/%j-%x.out # output file name
9 | #SBATCH --account=cnw@cpu # account
10 | #SBATCH --array=0-2756
11 | #SBATCH --partition=cpu_p1
12 |
13 | set -x -e
14 |
15 | source $cnw_ALL_CCFRWORK/start-m4-user
16 | conda activate lucile-m4
17 |
18 | export HF_DATASETS_OFFLINE=1
19 | export HF_DATASETS_CACHE=/gpfsscratch/rech/cnw/uue59kq/to_delete
20 |
21 | WORKING_DIR=/gpfswork/rech/cnw/uue59kq/repos/m4/m4/sourcing/processing/extracting_documents/get_modelling_metadata_dataset
22 | pushd $WORKING_DIR
23 |
24 | readarray -t SHARD_NAMES < shard_names.txt
25 | SHARD_NAME=${SHARD_NAMES[$SLURM_ARRAY_TASK_ID]}
26 | echo "Downloading shard: "$SHARD_NAME
27 |
28 | python get_modelling_metadata_dataset.py \
29 | --dataset-path /gpfsscratch/rech/cnw/urd43gx/c4-en-html-with-metadata/ \
30 | --save-dir /gpfsscratch/rech/cnw/commun/local_datasets/c4-en-html-with-metadata-arrow/ \
31 | --shard-name $SHARD_NAME
32 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | from functools import lru_cache
3 | from pathlib import Path
4 |
5 | import datasets
6 |
7 |
8 | DEFAULT_M4_CACHE_HOME = Path("~/.cache/m4")
9 |
10 |
11 | @lru_cache(maxsize=1)
12 | def get_m4_cache_dir() -> Path:
13 | return (Path(os.environ["M4_MANUAL_DIR"]) if "M4_MANUAL_DIR" in os.environ else DEFAULT_M4_CACHE_HOME).expanduser()
14 |
15 |
16 | @lru_cache(maxsize=1)
17 | def get_jz_dataset_dir() -> Path:
18 | if "DSDIR" in os.environ:
19 | return Path(os.environ["DSDIR"]).expanduser()
20 | raise ValueError("We're not in JZ. This method should only be called when running in JZ.")
21 |
22 |
23 | # All PMD datasets should follow a single feature API.
24 | _FEATURES = datasets.Features(
25 | {
26 | "image": datasets.Image(),
27 | "text": datasets.Value("string"),
28 | # Define where the sample comes from, this is necessary when we start to use aggregated versions like PMD.
29 | "source": datasets.Value("string"),
30 |         # Any kind of additional information is stored as a JSON string in `meta`
31 | "meta": datasets.Value("string"),
32 | }
33 | )
34 |
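
A minimal sketch of a toy dataset that conforms to this shared feature API (the single record is purely illustrative).

```python
import datasets
from PIL import Image

from m4.sourcing.pmd import _FEATURES

toy_pmd_subset = datasets.Dataset.from_dict(
    {
        "image": [Image.new("RGB", (8, 8))],
        "text": ["a tiny placeholder caption"],
        "source": ["toy_source"],
        "meta": ['{"note": "illustrative example"}'],
    },
    features=_FEATURES,
)
print(toy_pmd_subset.features)
```
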
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/fix_image_path.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Dict, List
2 |
3 | import datasets
4 |
5 |
6 | """
7 | Images can be stored in `datasets` either as bytes or as a path to an actual file. If a path is given, it needs to be
8 | made to work with the local setup: we remove a prefix and replace it with an environment-dependent one.
9 | `home/thomas_wang_hugginface_co/.cache/m4/...` -> f"{get_m4_cache_dir()}/..."
10 | """
11 |
12 |
13 | def get_image_paths_fixer(image_column_name: str, image_path_fixer: Callable[[str], str]):
14 | image_feature = datasets.Image(decode=True)
15 |
16 | def image_paths_fixer(batch: Dict[str, List]) -> Dict[str, List]:
17 |         # The column is loaded with Image(decode=False), so each image comes as a dict `{'path': str, 'bytes': bytes}`
18 | image_dicts = batch[image_column_name]
19 |
20 | for image_dict in image_dicts:
21 | # We ignore Images that store bytes directly
22 | if image_dict["bytes"] is not None:
23 | continue
24 |
25 | path = image_dict["path"]
26 | assert path is not None
27 | new_path = image_path_fixer(path)
28 | assert new_path is not None
29 | # Careful that's an in-place operation, which updates the dict stored in `batch`
30 | image_dict["path"] = new_path
31 |
32 | batch[image_column_name] = [image_feature.decode_example(image_dict) for image_dict in image_dicts]
33 | return batch
34 |
35 | return image_paths_fixer
36 |
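
A hypothetical usage sketch: the prefix being replaced and the `image` column name are illustrative, and the column is assumed to have been loaded with `datasets.Image(decode=False)` so that each entry is a `{'path', 'bytes'}` dict, as the inner comment describes.

```python
from m4.sourcing.pmd import get_m4_cache_dir

OLD_PREFIX = "/home/thomas_wang_hugginface_co/.cache/m4"  # assumed prefix baked into the stored paths


def fix_path(path: str) -> str:
    # Swap the baked-in prefix for the local, environment-dependent cache directory.
    return path.replace(OLD_PREFIX, str(get_m4_cache_dir()), 1)


image_paths_fixer = get_image_paths_fixer(image_column_name="image", image_path_fixer=fix_path)
# dataset = dataset.map(image_paths_fixer, batched=True)
```
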
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/jz_loaders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/jz_loaders/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/jz_loaders/jz_conceptual_captions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/jz_loaders/jz_conceptual_captions/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/jz_loaders/jz_wit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/jz_loaders/jz_wit/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/local_loaders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/local_loaders/coco/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/coco/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/local_loaders/localized_narratives__ADE20k/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/localized_narratives__ADE20k/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/local_loaders/localized_narratives__coco/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/localized_narratives__coco/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/local_loaders/localized_narratives__flickr30k/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/localized_narratives__flickr30k/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/local_loaders/localized_narratives__openimages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/localized_narratives__openimages/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/local_loaders/yfcc100m/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/yfcc100m/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/scripts/jz_image_pmd.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=jz_image_pmd
3 | #SBATCH --qos=qos_cpu-t4
4 | #SBATCH --ntasks=1
5 | #SBATCH --cpus-per-task=40
6 | #SBATCH --partition=cpu_p1
7 | #SBATCH --hint=nomultithread
8 | #SBATCH --array=0-1%2
9 | #SBATCH --time=100:00:00
10 | #SBATCH --output=output-%x-%A_%a.out
11 | #SBATCH --error=output-%x-%A_%a.out
12 | #SBATCH --account=cnw@cpu
13 | #SBATCH --mail-type=ALL
14 | #SBATCH --mail-user=victor@huggingface.co
15 |
16 | source ~/.bashrc_cnw
17 | eval "$(conda shell.bash hook)"
18 | conda activate victor
19 |
20 | export HF_DATASETS_OFFLINE=1
21 | export HF_DATASETS_CACHE=$cnw_ALL_CCFRSCRATCH/datasets
22 |
23 | all_dataset=(
24 | jz_wit
25 | jz_conceptual_captions
26 | )
27 | dataset_name=${all_dataset[${SLURM_ARRAY_TASK_ID}]}
28 |
29 | python jz_pmd.py --dataset_name $dataset_name --loading_script_path $WORK/code/m4/m4/sourcing/pmd/jz_loaders/$dataset_name
30 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/scripts/jz_pmd.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import math
3 | import os
4 |
5 | from datasets import DatasetDict, load_dataset
6 |
7 | from m4.utils.datasets.get_self_contained_ds import process_ds_wrapped
8 |
9 |
10 | if __name__ == "__main__":
11 |     parser = argparse.ArgumentParser(description="Create arrow files for subsets of image PMD - JZ version.")
12 | parser.add_argument(
13 | "--dataset_name", type=str, required=True, help="Should be either `jz_conceptual_captions` or `jz_wit`."
14 | )
15 | parser.add_argument("--loading_script_path", type=str, required=True, help="Path to the loading script.")
16 | parser.add_argument(
17 | "--num_proc",
18 | type=int,
19 | default=1,
20 |         help="Number of processes to use for multiprocessing (in particular, calls to `map`).",
21 | )
22 | args = parser.parse_args()
23 |
24 | dataset_name = args.dataset_name.replace("jz_", "")
25 |
26 | dataset = load_dataset(args.loading_script_path)
27 |
28 | print("Start converting the images to bytes.")
29 | dataset = process_ds_wrapped(dataset, batch_size=1_000, num_proc=args.num_proc)
30 |
31 | print("Start saving shards.")
32 | if isinstance(dataset, DatasetDict):
33 | for split_name, dset in dataset.items():
34 | nb_of_shards = math.ceil(len(dset) / 50_000)
35 | shards = [dset.shard(num_shards=nb_of_shards, index=i, contiguous=True) for i in range(nb_of_shards)]
36 | for i, shard in enumerate(shards):
37 | shard.save_to_disk(
38 | f"{os.environ['cnw_ALL_CCFRSCRATCH']}/general_pmd/image/{dataset_name}/{split_name}/{i:05}-{nb_of_shards:05}"
39 | )
40 | else:
41 |         raise ValueError(f"`dataset` is of type {type(dataset)}, which is not supported yet.")
42 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/processing/README.md:
--------------------------------------------------------------------------------
1 | # Data Processing Pipelines
2 |
3 | Relates to issue [#12](https://github.com/huggingface/m4/issues/12).
4 |
5 | We have two v0 data processing pipelines:
6 | - (a) `split` (for sharding) + `parallel`/Slurm arrays running arbitrary processing scripts (Python or Rust, for instance)
7 | - (b) Apache Beam (for building processing pipelines) + Dataflow (for horizontal scaling)
8 |
9 | ## App
10 |
11 | The n-gram search app is mostly an example.
12 | To launch it:
13 | ```bash
14 | streamlit run app.py --server.port 6006
15 | ```
16 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/processing/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/processing/extracting_ngrams/README.md:
--------------------------------------------------------------------------------
1 | ## Locally
2 | Run the `run_document_ngrams_extraction.sh` script.
3 |
4 | ## On JZ
5 |
6 | - Add the following line to your `~/.bashrc` (it points to custom installations of `jq` and `parallel`):
7 | ```bash
8 | export PATH=$PATH:/gpfswork/rech/six/commun/lib/jq-1.5/bin/:/gpfswork/rech/six/commun/lib/parallel/bin/
9 | ```
10 |
11 | Then, run the slurm script (`sbatch pipe.slurm`).
12 |
--------------------------------------------------------------------------------
/vision/m4/sourcing/processing/extracting_ngrams/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/processing/extracting_ngrams/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/processing/extracting_ngrams/run_document_ngrams_extraction.sh:
--------------------------------------------------------------------------------
1 | eval "$(conda shell.bash hook)"
2 | conda activate m3
3 |
4 | # TODO: update so that we can take in multiple shards
5 | N_WORKERS=4
6 | DATA_PATH=/home/victor_huggingface_co/m4/data
7 | SHARD_NAME=4e47925f7c894bd8eb56e5dd1d778ec77bf2c90f6cee0e32e31615393391c67a
8 | NB_DOCS_PER_SUBSHARD=2000
9 |
10 | # Get the text field only
11 | jq ".text" < $DATA_PATH/raw_dumps/$SHARD_NAME > $DATA_PATH/processed_dumps/$SHARD_NAME.texts
12 | # Get the URL field only
13 | jq -r "[input_line_number,.url] | @csv" < $DATA_PATH/raw_dumps/$SHARD_NAME > $DATA_PATH/extracted_databases/$SHARD_NAME.urls.csv
14 | # Get the HTML field only
15 | jq -r "[input_line_number,.html] | @csv" < $DATA_PATH/raw_dumps/$SHARD_NAME > $DATA_PATH/extracted_databases/$SHARD_NAME.htmls.csv
16 |
17 | # Splitting into subshards
18 | split --lines $NB_DOCS_PER_SUBSHARD --numeric-suffixes $DATA_PATH/processed_dumps/$SHARD_NAME.texts $DATA_PATH/processed_dumps/$SHARD_NAME.texts.
19 |
20 | # Extract ngrams in each documents
21 | find $DATA_PATH/processed_dumps/ | \
22 | grep "${DATA_PATH}/processed_dumps/${SHARD_NAME}.texts.[0-9]+*" | \
23 | parallel --verbose -j $N_WORKERS --progress "TRANSFORMERS_OFFLINE=1 TRANSFORMERS_VERBOSITY=error python extract_documents_ngrams.py --filepath {} --nb_docs_per_subshard $NB_DOCS_PER_SUBSHARD" > \
24 | $DATA_PATH/extracted_databases/$SHARD_NAME.ngrams.csv
25 |
26 | # Remove the subshards
27 | find $DATA_PATH/processed_dumps/ | grep "${DATA_PATH}/processed_dumps/${SHARD_NAME}.texts.[0-9]+*" | xargs -d"\n" rm
28 |
--------------------------------------------------------------------------------
/vision/m4/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/training/__init__.py
--------------------------------------------------------------------------------
/vision/m4/training/debug_utils.py:
--------------------------------------------------------------------------------
1 | """ Trainer debug utils """
2 |
3 |
4 | def dump_optim_states(self):
5 | """dumps basic information about the state of the optimizer"""
6 |
7 | print("*** Optim States Dump:")
8 | param_groups_cnt = len(self.vl_optim.param_groups)
9 | # state dict has more than param_groups info, so extract only the param groups
10 | param_group_states = list(self.vl_optim.state.values())[:param_groups_cnt]
11 | for i, state in enumerate(param_group_states):
12 | print(f"param group: {i}")
13 | print(f" step={state['step']}")
14 | print(f" exp_avg all_zero={all(state['exp_avg'] == 0)}")
15 | print(f" exp_avg_sq all_zero={all(state['exp_avg_sq'] == 0)}")
16 |
17 | # can also dump LR state if need be
18 | # print(f"LR={self.vl_scheduler.get_last_lr()}")
19 |
20 |
21 | def validate_optim_states_are_reset(self):
22 | """
23 | for a new or fully reset optimizer we expect all zeros `exp_avg` and `exp_avg_sq` state tensors and step=1
24 | """
25 |
26 | param_groups_cnt = len(self.vl_optim.param_groups)
27 | param_group_states = list(self.vl_optim.state.values())[:param_groups_cnt]
28 | for i, state in enumerate(param_group_states):
29 | if state["step"] != 1:
30 | raise ValueError(f"optimizer reset didn't seem to work: state={i} step={state['step']}")
31 | if not all(state["exp_avg"] == 0):
32 |             raise ValueError(f"optimizer reset didn't seem to work: state={i} exp_avg={state['exp_avg']}")
33 | if not all(state["exp_avg_sq"] == 0):
34 |             raise ValueError(f"optimizer reset didn't seem to work: state={i} exp_avg_sq={state['exp_avg_sq']}")
35 |
--------------------------------------------------------------------------------
/vision/m4/training/setup_language_model.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from m4.models.idefics.configuration_idefics import IdeficsConfig
4 | from m4.models.idefics.modeling_idefics import IdeficsForCausalLM
5 | from m4.models.vgpt2.configuration_vgpt2 import VGPT2Config
6 | from m4.models.vgpt2.modeling_vgpt2 import VGPT2LMHeadModel
7 | from m4.models.vllama3.configuration_vllama3 import VLlama3Config
8 | from m4.models.vllama3.modeling_vllama3 import VLlama3ForCausalLM
9 | from m4.models.vmistral.configuration_vmistral import VMistralConfig
10 | from m4.models.vmistral.modeling_vmistral import VMistralForCausalLM
11 |
12 |
13 | model_name2classes = {
14 | r"gpt2": [VGPT2Config, VGPT2LMHeadModel],
15 | r"idefics": [IdeficsConfig, IdeficsForCausalLM],
16 | r"mistral": [VMistralConfig, VMistralForCausalLM],
17 | r"llama": [VLlama3Config, VLlama3ForCausalLM],
18 | r"smollm": [VLlama3Config, VLlama3ForCausalLM],
19 | }
20 |
21 |
22 | def model_name_to_classes(model_name_or_path):
23 | """returns config_class, model_class for a given model name or path"""
24 |
25 | model_name_lowcase = model_name_or_path.lower()
26 | for rx, classes in model_name2classes.items():
27 | if re.search(rx, model_name_lowcase):
28 | return classes
29 | else:
30 | raise ValueError(
31 | f"Unknown type of backbone LM. Got {model_name_or_path}, supported regexes:"
32 | f" {list(model_name2classes.keys())}."
33 | )
34 |
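
A minimal sketch of the lookup; the checkpoint name is only an example of a string matching the `smollm` pattern.

```python
from m4.models.vllama3.configuration_vllama3 import VLlama3Config
from m4.models.vllama3.modeling_vllama3 import VLlama3ForCausalLM
from m4.training.setup_language_model import model_name_to_classes

config_cls, model_cls = model_name_to_classes("HuggingFaceTB/SmolLM2-1.7B")
assert config_cls is VLlama3Config and model_cls is VLlama3ForCausalLM  # "smollm" matches the regex
```
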
--------------------------------------------------------------------------------
/vision/m4/training/setup_vision_model.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from transformers import AutoModel
4 |
5 |
6 | # Map of the supported vision (CV) architectures and of how to extract the vision model - for some architectures we need to
7 | # go through a specific attribute to reach it, as in `model.vision_model` for CLIP
8 | vision_model_name2model = {
9 | r"clip": lambda model: model.vision_model,
10 | r"siglip": lambda model: model.vision_model,
11 | r"vit": lambda model: model,
12 | }
13 |
14 |
15 | def vision_model_name_to_model(model_name_or_path, model):
16 | """returns the model if supported, asserts otherwise"""
17 |
18 | model_name_lowcase = model_name_or_path.lower()
19 | for rx, lookup in vision_model_name2model.items():
20 | if re.search(rx, model_name_lowcase):
21 | return lookup(model)
22 | else:
23 | raise ValueError(
24 | f"Unknown type of backbone vision model. Got {model_name_or_path}, supported regexes:"
25 | f" {list(vision_model_name2model.keys())}."
26 | )
27 |
28 |
29 | def get_vision_model(config):
30 | vision_model_name = config.vision_model_name
31 | vision_model_params = eval(config.vision_model_params)
32 |
33 | model = AutoModel.from_pretrained(vision_model_name, **vision_model_params, trust_remote_code=True)
34 | return vision_model_name_to_model(vision_model_name, model)
35 |
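
A minimal sketch of resolving a vision backbone by name; the SigLIP checkpoint id is only an example of a name matching the `siglip` pattern.

```python
from transformers import AutoModel

from m4.training.setup_vision_model import vision_model_name_to_model

checkpoint = "google/siglip-base-patch16-224"  # example checkpoint id
full_model = AutoModel.from_pretrained(checkpoint)
vision_tower = vision_model_name_to_model(checkpoint, full_model)  # returns full_model.vision_model
```
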
--------------------------------------------------------------------------------
/vision/m4/training/types.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class DatasetNames(Enum):
5 | PMD = "pmd"
6 | LAION = "laion"
7 | LAION_COCO = "laion_coco"
8 | TIKZ = "tikz"
9 | CM4 = "cm4"
10 | WIKI = "wiki"
11 | IMAGE_WEBSITE_CODE = "image_website_code"
12 | VQAV2_TASK_FINETUNING = "vqav2_task_finetuning"
13 | OCR = "ocr"
14 | DOCVQA = "docvqa"
15 | SFT = "sft"
16 |
17 |
18 | class DatasetTypes(Enum):
19 | WEB_DOCUMENTS = "wd"
20 | IMAGE_CAPTION_PAIRS = "icp"
21 | VQAV2_TASK_FINETUNING = "vqav2_task_finetuning"
22 | OCR = "ocr"
23 | DOCVQA = "docvqa"
24 | SFT = "sft"
25 |
--------------------------------------------------------------------------------
/vision/m4/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/utils/__init__.py
--------------------------------------------------------------------------------
/vision/m4/utils/check_valid_tokenizer.py:
--------------------------------------------------------------------------------
1 | def check_valid_tokenizer(tokenizer) -> None:
2 |     """Check that the special tokens were correctly added to the tokenizer
3 |     and that they are not normalized.
4 |     """
5 | tok_class = type(tokenizer).__name__.lower()
6 | if ("idefics" in tok_class) or ("mistral" in tok_class):
7 | assert "" in tokenizer.get_vocab()
8 | assert "" in tokenizer.get_vocab()
9 | assert "" in tokenizer.get_vocab()
10 | assert "" in tokenizer.get_vocab()
11 | assert "" in tokenizer.get_vocab()
12 |
13 | for _, val in tokenizer.added_tokens_decoder.items():
14 | assert not val.normalized # assert that normalized=False for all AddedToken
15 |
--------------------------------------------------------------------------------
/vision/m4/utils/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/utils/datasets/__init__.py
--------------------------------------------------------------------------------
/vision/m4/utils/debug.py:
--------------------------------------------------------------------------------
1 | import builtins
2 | import fcntl
3 |
4 |
5 | def printflock(*args, **kwargs):
6 | """
7 | This is a wrapper around the built-in Python `print` which calls `flock` before calling
8 | `print` and unlocks it immediately after. This wrapper is useful for when each rank needs to
9 | print a message without getting it interleaved with prints from other ranks.
10 | The lock file is the file this wrapper is defined in.
11 | The output order will be random per rank.
12 |
13 | Example:
14 | >>> # assuming 4 GPUs
15 | >>> world_size = dist.get_world_size()
16 | >>> rank = dist.get_rank()
17 | >>> printflock(f"This is a very long message from rank {rank}/{world_size}")
18 | This is a very long message from rank 0/4
19 | This is a very long message from rank 2/4
20 | This is a very long message from rank 3/4
21 | This is a very long message from rank 1/4
22 |
23 | It can also be used to override normal `print` for an easier multi-gpu debug:
24 |
25 | from m4.utils.debug import printflock as print
26 |
27 | and then you don't need to change anything in your code, the normal `print` calls will all be non-interleaved
28 | """
29 |
30 | with open(__file__, "r") as fh:
31 | fcntl.flock(fh, fcntl.LOCK_EX)
32 | try:
33 | builtins.print(*args, **kwargs)
34 | finally:
35 | fcntl.flock(fh, fcntl.LOCK_UN)
36 |
--------------------------------------------------------------------------------
/vision/m4/utils/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/utils/training/__init__.py
--------------------------------------------------------------------------------