├── .github └── workflows │ └── trufflehog.yml ├── .gitignore ├── LICENSE ├── README.md ├── text ├── README.md ├── data │ ├── decontamination │ │ └── README.md │ ├── finemath │ │ └── README.md │ ├── fineweb-edu │ │ └── README.md │ └── smoltalk │ │ ├── README.md │ │ ├── constraints │ │ ├── README.md │ │ ├── filter_ifeval_data.py │ │ ├── launch_ifeval_pipeline.slurm │ │ └── pipeline │ │ │ ├── __init__.py │ │ │ ├── ifeval_tasks.py │ │ │ ├── json_schemas.py │ │ │ ├── pipeline.py │ │ │ └── system_prompts.py │ │ ├── magpie_ultra_v1 │ │ ├── README.md │ │ └── pipeline.py │ │ ├── rewrite │ │ ├── README.md │ │ ├── launch_rewrite_pipeline.slurm │ │ └── pipeline │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ └── pipeline.py │ │ └── summarization │ │ ├── README.md │ │ └── pipelines │ │ ├── cnn_daily_summaries.py │ │ └── email_summaries.py ├── evaluation │ ├── README.md │ ├── math_utils.py │ ├── requirements.txt │ ├── smollm2_base.txt │ ├── smollm2_instruct.txt │ └── tasks.py ├── finetuning │ ├── Dockerfile │ ├── README.md │ ├── requirements.txt │ └── train.py └── pretraining │ ├── README.md │ ├── continual-pretraining │ ├── README.md │ └── finemath │ │ ├── 160B-runs │ │ ├── fwedu-finemath-infiwebmath-3plus.yaml │ │ └── fwedu-finemath-infiwebmath-4plus.yaml │ │ ├── 60B-runs │ │ ├── finemath-3plus.yaml │ │ ├── finemath-4plus.yaml │ │ ├── finemath-infiwebmath-3plus.yaml │ │ ├── finemath-infiwebmath-4plus.yaml │ │ ├── infiwebmath-3plus.yaml │ │ ├── infiwebmath-4plus.yaml │ │ ├── infiwebmath.yaml │ │ └── openwebmath.yaml │ │ ├── finemath-tokenize.py │ │ ├── tokenization_InfiMM-WebMath-40B.patch │ │ └── tokenization_finemath.patch │ ├── launch.slurm │ ├── smollm1 │ ├── config_smollm1_135M.yaml │ ├── config_smollm1_1B.yaml │ └── config_smollm1_360M.yaml │ └── smollm2 │ ├── config_smollm2_135M.yaml │ ├── config_smollm2_1B.yaml │ └── config_smollm2_360M.yaml ├── tools ├── README.md ├── smol_tools │ ├── README.md │ ├── demo_tkinter.py │ ├── requirements.txt │ └── smol_tools │ │ ├── agent.py │ │ ├── base.py │ │ ├── chatter.py │ │ ├── rewriter.py │ │ ├── summarizer.py │ │ └── titler.py ├── smollm_local_inference │ ├── README.md │ ├── llama-cpp-python.py │ ├── mlc.py │ ├── mlx.py │ └── transformers-js.js └── smolvlm_local_inference │ ├── README.md │ └── SmolVLM_video_inference.py └── vision ├── README.md ├── data ├── README.md └── datasets_processing_scripts │ ├── 01_tar_datasets_with_jpeg │ └── python_scripts │ │ ├── 01_convert_coco_per_shard_idx.py │ │ ├── 02_convert_cm4_per_shard_idx.py │ │ ├── 03_convert_laoin_per_shard_idx.py │ │ ├── 04_convert_cm4_per_shard_idx.py │ │ ├── 05_convert_scaled_laion_per_shard_idx.py │ │ ├── 06_convert_lrv_per_shard_idx.py │ │ ├── 07_convert_llava_per_shard_idx.py │ │ ├── 08_convert_svit_per_shard_idx.py │ │ ├── 09_convert_ultrachat_per_shard_idx.py │ │ ├── 10_convert_m3it_per_shard_idx.py │ │ ├── 11_convert_spot_difference_per_shard_idx.py │ │ ├── 12_convert_llavar_per_shard_idx.py │ │ ├── 13_convert_vqav2_task_finetuning_per_shard_idx.py │ │ ├── 14_convert_tikz_per_shard_idx.py │ │ ├── 15_convert_docvqa_per_shard_idx.py │ │ ├── 16_convert_image_website_code_per_shard_idx.py │ │ ├── 17_convert_websight_v02_per_shard_idx.py │ │ ├── 18_convert_sft_per_shard_idx.py │ │ └── 19_convert_websight_mix_per_shard_idx.py │ ├── build_concatenation_datasets_sft │ ├── build_concat_ds_sft.py │ ├── build_ds_sft.py │ ├── build_the_cauldron.py │ ├── create_set_hashes_test_images.py │ ├── job_build_the_cauldron.slurm │ ├── job_merge_on_image_individual_dataset.slurm │ ├── 
merge_on_image_individual_dataset.py │ ├── tar_dataset_pattern_check.py │ └── viz_tool.py │ ├── build_ethic_dataset │ ├── bias_generation_eval_idefics.py │ └── bias_generation_eval_idefics2.py │ ├── build_image_website_code │ ├── 01_generate_ideas_website.py │ ├── 02_generate_html_css_codes.py │ ├── 03_extraction_html_css_codes.py │ └── 04_screenshot_rendered_websites.py │ ├── build_laion_coco_dataset │ └── python_scripts │ │ ├── 02_01_find_opt_out.py │ │ ├── 02_02_remove_opt_out.py │ │ ├── 03_remove_nsfw_images.py │ │ ├── 04_remove_small_images.py │ │ ├── 05_binary_classification.py │ │ ├── make_laion_coco.py │ │ ├── make_laion_coco_1_4_dataset.py │ │ └── train_bin_classif.py │ ├── build_laion_dataset │ └── python_scripts │ │ ├── 01_01_download_prepare_laion.py │ │ ├── 01_02_template_loading_script_laion.py │ │ ├── 02_filter_laion.py │ │ ├── 03_01_prepare_dedup_laion.py │ │ ├── 03_02_dedup_laion.py │ │ ├── 04_01_find_laion_urls.py │ │ ├── 04_02_create_ds_laion_urls.py │ │ ├── 04_03_find_opt_out_images_laion.py │ │ ├── 04_04_remove_opt_out_images_laion.py │ │ └── merge_1_4_laion_big_shards.py │ ├── build_webdocs_dataset │ └── python_scripts │ │ ├── 01_download_warc.py │ │ ├── 02_bis_extract_html_get_image_urls_new_rules.py │ │ ├── 02_extract_html_get_image_urls.py │ │ ├── 02_parallel_extract_html_get_image_urls.py │ │ ├── 03_dl_images_create_dataset.py │ │ ├── 03_parallel_dl_images_create_dataset.py │ │ ├── 04_merge_web_docs_with_images.py │ │ ├── 05_filtering_web_docs.py │ │ ├── 06_01_create_set_image_urls_in_webdocs.py │ │ ├── 06_02_merge_sets_image_urls_in_webdocs.py │ │ ├── 06_03_remove_image_duplicates.py │ │ ├── 07_01_nsfw_image_filtering.py │ │ ├── 07_02_nsfw_image_visualization.py │ │ ├── 07_03_nsfw_image_removal.py │ │ ├── 08_01_prepare_urldedup.py │ │ ├── 08_02_urldedup.py │ │ ├── 09_01_create_web_docs_texts_only.py │ │ ├── 09_02_get_domain_to_positions.py │ │ ├── 09_03_split_domain_to_positions.py │ │ ├── 09_04_get_domain_to_duplicated_texts.py │ │ ├── 09_05_merge_domain_to_duplicated_texts_sharded.py │ │ ├── 09_06_line_dedup.py │ │ ├── 09_07_merge_web_docs_texts_only_and_rest.py │ │ ├── 10_final_cleaning.py │ │ ├── 11_01_create_set_img_urls.py │ │ ├── 11_02_get_docs_to_remove_by_set_img_urls_dedup.py │ │ ├── 11_03_set_img_urls_dedup.py │ │ ├── 12_01_find_opt_out_images.py │ │ ├── 12_02_remove_opt_out_images.py │ │ ├── 13_final_processing.py │ │ ├── 14_01_filter_perplexity_with_language_model.py │ │ ├── 15_01_find_urls_obelics.py │ │ ├── 15_02_find_opt_out.py │ │ ├── 15_03_remove_opt_out_documents.py │ │ └── 15_04_remove_opt_out_images.py │ ├── build_websight_v02 │ └── python_scripts │ │ ├── 01_generate_ideas_websites.py │ │ ├── 02_01_generate_html_codes_prompt_1.py │ │ ├── 02_02_generate_html_codes_prompt_2.py │ │ ├── 03_filtering_html_codes.py │ │ └── 04_screenshot_html_codes.py │ ├── clean_m4_prelimenary_experiments │ ├── README.md │ ├── explore │ │ ├── assets │ │ │ └── DOM_tree_viz.html │ │ ├── explore.py │ │ └── global_visualization.py │ └── python_scripts │ │ ├── 01_shard_names.txt │ │ ├── 02_add_html_back.py │ │ ├── 03_clean_v2.py │ │ ├── 04_get_banned_urls.py │ │ ├── 05_filter_cm4.py │ │ └── get_modelling_metadata_dataset.py │ ├── create_evaluation_datasets │ ├── Flickr30k │ │ └── flickr30k.py │ ├── MMBench │ │ └── make_mmbench.py │ ├── NLVR2 │ │ └── NLVR2.py │ ├── README.md │ ├── SEED │ │ └── make_seed.py │ ├── ScienceQA │ │ ├── scienceqa.py │ │ └── scienceqa_old_setup.py │ ├── SugarCrepe │ │ └── make_sugarcrepe.py │ ├── create_AI2D │ │ ├── create_ai2d.py │ │ ├── 
create_ai2d_2.py │ │ ├── create_ai2d_4.py │ │ ├── create_ai2d_5_abcd.py │ │ ├── create_ai2d_6_abcd.py │ │ └── create_ai2d_7_abcd.py │ ├── create_clevr.py │ ├── create_coco.py │ ├── create_fairface.py │ ├── create_flickr30k.py │ ├── create_hateful_memes.py │ ├── create_imagenet1k.py │ ├── create_imagenet1k_1ksupportset_subsets.py │ ├── create_imagenet1k_5ksupportset_subsets.py │ ├── create_math_vista.py │ ├── create_math_vista_mcq.py │ ├── create_math_vista_open_ended.py │ ├── create_mmbench.py │ ├── create_mmmu.py │ ├── create_mmmu_mcq.py │ ├── create_mmmu_open_ended.py │ ├── create_mmstar.py │ ├── create_nlvr2.py │ ├── create_nocaps.py │ ├── create_okvqa.py │ ├── create_renderedsst2.py │ ├── create_scienceqa.py │ ├── create_scienceqa_old_setup.py │ ├── create_textcaps.py │ ├── create_textvqa.py │ ├── create_visdial.py │ ├── create_vizwiz.py │ ├── create_vqav2.py │ ├── create_vqav2_subsets.py │ └── dedup_val_mmbench.py │ ├── create_fine_tuning_datasets │ ├── create_aokvqa.py │ ├── create_llavar.py │ ├── create_m3it.py │ ├── create_pgm.py │ ├── create_raven.py │ ├── create_spot_difference.py │ └── create_table_datasets.py │ ├── create_valid_ds │ ├── create_cm4_valid.py │ ├── create_coco_valid.py │ └── create_wiki_valid.py │ ├── enwiki │ ├── REAME.md │ └── python_scripts │ │ ├── 01_extract_text_and_urls_from_wikipedia_web_documents.py │ │ ├── 02_load_wit_images_in_ds.py │ │ ├── 02bis_get_stats.py │ │ ├── 03_extract_intermediary_dataset.py │ │ ├── 04_get_list_of_remaining_images.py │ │ ├── 04bis_get_list_of_remaining_images.py │ │ ├── 05_download_remaining_urls.py │ │ ├── 06_create_image_dataset.py │ │ ├── 07_get_images_in_ds.py │ │ └── 08_save_dataset.py │ ├── integrate_evaluation_benchmarks_chatbot │ ├── gqa.py │ ├── llava_wild.py │ ├── mm_vet.py │ ├── mmbench.py │ ├── mmbench_no_mcq.py │ ├── pope.py │ ├── qbench.py │ ├── scienceqa.py │ ├── scienceqa_no_mcq.py │ ├── seed_img.py │ └── vsr.py │ └── upload_rendered_text_dataset │ └── upload_tar_to_s3.py ├── evaluation └── README.md ├── experiments ├── evaluation │ └── vloom │ │ ├── README.md │ │ ├── async_eval_template │ │ ├── run_evals_0_shots.slurm │ │ ├── run_evals_0_shots_a_la_flamingo.slurm │ │ ├── run_evals_4_shots.slurm │ │ └── run_evals_perplexity_validation.slurm │ │ ├── async_evals_tr_341 │ │ ├── run_evals_4_shots_captioning_1024.slurm │ │ ├── run_evals_4_shots_captioning_2048.slurm │ │ ├── run_evals_4_shots_vqa_1024.slurm │ │ └── run_evals_4_shots_vqa_2048.slurm │ │ ├── async_evals_tr_343 │ │ ├── run_evals_4_shots_captioning_1024.slurm │ │ ├── run_evals_4_shots_captioning_2048.slurm │ │ ├── run_evals_4_shots_vqa_1024.slurm │ │ └── run_evals_4_shots_vqa_2048.slurm │ │ ├── async_evals_tr_346 │ │ ├── run_evals_0_shots_test_2048_docvqa.slurm │ │ ├── run_evals_0_shots_val_1024.slurm │ │ ├── run_evals_0_shots_val_1536.slurm │ │ ├── run_evals_0_shots_val_2048.slurm │ │ └── run_evals_0_shots_val_512.slurm │ │ ├── async_evals_tr_348 │ │ ├── run_evals_4_shots_captioning_1024.slurm │ │ ├── run_evals_4_shots_captioning_1920.slurm │ │ ├── run_evals_4_shots_vqa_1024.slurm │ │ └── run_evals_4_shots_vqa_1920.slurm │ │ ├── async_evals_tr_349 │ │ ├── run_evals_0_shots_test_2048_docvqa.slurm │ │ ├── run_evals_0_shots_val_1024.slurm │ │ ├── run_evals_0_shots_val_1536.slurm │ │ ├── run_evals_0_shots_val_2048.slurm │ │ └── run_evals_0_shots_val_512.slurm │ │ ├── common │ │ ├── accelerate_config.yaml │ │ ├── run_cron_evals_multi_task_cluster.slurm │ │ ├── run_evals_cluster.slurm │ │ ├── run_evals_local_datasets.slurm │ │ ├── 
run_evals_local_datasets_tickets.slurm │ │ ├── run_evals_multi_task_cluster.slurm │ │ ├── run_evals_multi_task_cluster_s3.slurm │ │ ├── sync_evaluations_on_gcs.slurm │ │ ├── sync_evaluations_on_s3.slurm │ │ └── sync_evaluations_on_wandb.slurm │ │ ├── cron_eval_template │ │ ├── run_evals_0_shots.slurm │ │ └── run_evals_2_shots.slurm │ │ ├── cron_tr_cron_template │ │ ├── run_evals_0_shots.slurm │ │ └── run_evals_4_shots.slurm │ │ └── slurm_scripts_templates │ │ └── run_evals_master_template.slurm └── pretraining │ └── vloom │ ├── README.md │ ├── common │ ├── sync_and_upload.sh │ └── webdataset_get_file.sh │ ├── slurm_scripts_templates │ ├── accelerate_config_multi_node.yaml │ ├── accelerate_config_single_node.yaml │ ├── ds_config.json │ ├── ds_config_bf16.json │ ├── hfc_with_launcher │ │ ├── 01_launch.sh │ │ ├── cleanup-checkpoints.slurm │ │ ├── config.yaml │ │ ├── convert-checkpoints.slurm │ │ ├── s3-upload-checkpoints.slurm │ │ ├── schedule-evals.slurm │ │ └── train.slurm │ ├── multi_node_run.slurm │ ├── single_node_run.slurm │ └── with_launcher │ │ ├── 01_launch.slurm │ │ ├── accelerate_config.yaml │ │ ├── config.yaml │ │ ├── ds_config.json │ │ └── train.slurm │ ├── tr_341_smolvlm_025b_1st_stage │ ├── 01_launch.sh │ ├── cleanup-checkpoints.slurm │ ├── config.yaml │ ├── convert-checkpoints.slurm │ ├── merge_lora_and_resize_eou.slurm │ ├── resize_embed_for_eou.py │ ├── s3-upload-checkpoints.slurm │ ├── s3-upload-run-files.slurm │ ├── schedule-evals.sh │ ├── slurm-status.slurm │ └── train.slurm │ ├── tr_343_smolvlm_05b_1st_stage │ ├── 01_launch.sh │ ├── cleanup-checkpoints.slurm │ ├── config.yaml │ ├── convert-checkpoints.slurm │ ├── merge_lora_and_resize_eou.slurm │ ├── resize_embed_for_eou.py │ ├── s3-upload-checkpoints.slurm │ ├── s3-upload-run-files.slurm │ ├── schedule-evals.slurm │ ├── slurm-status.slurm │ └── train.slurm │ ├── tr_345_vsmollm2_256M_2nd_stage │ ├── 01_launch.sh │ ├── cleanup-checkpoints.slurm │ ├── config.yaml │ ├── convert-checkpoints.slurm │ ├── merge_lora_and_resize_eou.slurm │ ├── resize_embed_for_eou.py │ ├── s3-upload-checkpoints.slurm │ ├── s3-upload-run-files.slurm │ ├── schedule-evals.sh │ ├── slurm-status.slurm │ └── train.slurm │ ├── tr_346_vsmollm2_256M_3rd_stage │ ├── 01_launch.sh │ ├── cleanup-checkpoints.slurm │ ├── config.yaml │ ├── convert-checkpoints.slurm │ ├── merge_lora_and_resize_eou.slurm │ ├── resize_embed_for_eou.py │ ├── s3-upload-checkpoints.slurm │ ├── s3-upload-run-files.slurm │ ├── schedule-evals.sh │ ├── slurm-status.slurm │ └── train.slurm │ ├── tr_347_smolvlm_500M_2nd_stage │ ├── 01_launch.sh │ ├── cleanup-checkpoints.slurm │ ├── config.yaml │ ├── convert-checkpoints.slurm │ ├── merge_lora_and_resize_eou.slurm │ ├── resize_embed_for_eou.py │ ├── s3-upload-checkpoints.slurm │ ├── s3-upload-run-files.slurm │ ├── schedule-evals.sh │ ├── slurm-status.slurm │ └── train.slurm │ ├── tr_348_smolvlm_2B │ ├── 01_launch.sh │ ├── cleanup-checkpoints.slurm │ ├── config.yaml │ ├── convert-checkpoints.slurm │ ├── merge_lora_and_resize_eou.slurm │ ├── resize_embed_for_eou.py │ ├── s3-upload-checkpoints.slurm │ ├── s3-upload-run-files.slurm │ ├── schedule-evals.slurm │ ├── slurm-status.slurm │ └── train.slurm │ ├── tr_349_vsmollm2_500M_3rd_stage │ ├── 01_launch.sh │ ├── cleanup-checkpoints.slurm │ ├── config.yaml │ ├── convert-checkpoints.slurm │ ├── merge_lora_and_resize_eou.slurm │ ├── resize_embed_for_eou.py │ ├── s3-upload-checkpoints.slurm │ ├── s3-upload-run-files.slurm │ ├── schedule-evals.sh │ ├── slurm-status.slurm │ └── train.slurm │ ├── 
tr_350_smolvlm_2B_2nd_stage │ ├── 01_launch.sh │ ├── cleanup-checkpoints.slurm │ ├── config.yaml │ ├── convert-checkpoints.slurm │ ├── merge_lora_and_resize_eou.slurm │ ├── resize_embed_for_eou.py │ ├── s3-upload-checkpoints.slurm │ ├── s3-upload-run-files.slurm │ ├── schedule-evals.sh │ ├── slurm-status.slurm │ └── train.slurm │ └── tr_cron_template │ ├── 01_launch.sh │ ├── README.md │ ├── cleanup-checkpoints.slurm │ ├── config.yaml │ ├── convert-checkpoints.slurm │ ├── s3-upload-checkpoints.slurm │ ├── s3-upload-run-files.slurm │ ├── schedule-evals.slurm │ └── train.slurm ├── finetuning ├── README.md ├── SmolVLM2_Video_FT.ipynb └── Smol_VLM_FT.ipynb └── m4 ├── __init__.py ├── evaluation ├── README.md ├── __init__.py ├── config.py ├── custom_metrics │ ├── __init__.py │ ├── classification_vqa_metrics.py │ ├── doc_vqa_metrics.py │ ├── image_caption_matching_metrics.py │ ├── open_ended_vqa_metrics.py │ ├── perplexity_metrics.py │ ├── unfolded_classification_metrics.py │ ├── unfolded_image_captioning_metrics.py │ └── utils.py ├── evaluators │ ├── __init__.py │ ├── in_contexter.py │ └── linear_prober.py ├── generation │ ├── README.md │ ├── config.py │ ├── deprecated_generation │ │ ├── generate.py │ │ ├── launch_generation.py │ │ ├── log_generation.py │ │ ├── log_generation.slurm │ │ ├── make_generation.slurm │ │ └── master_generate.slurm │ └── generate.py ├── launch.py ├── scripts │ ├── README.md │ ├── copy_remote_sample_datasets.py │ ├── create_sample_evaluation_datasets.py │ ├── create_sample_evaluation_datasets_simplified.py │ ├── docvqa_to_submission_format.ipynb │ ├── mmbench_submission_format.py │ ├── mmmu_submission_format.py │ ├── sync_evaluations_on_wandb.py │ └── visualize_generations.py ├── tasks │ ├── __init__.py │ └── base.py ├── utils.py └── vqa_labels.py ├── models ├── __init__.py ├── common.py ├── custom_modules.py ├── idefics │ ├── configuration_idefics.py │ ├── evaluation_captioning_in_context_idefics.py │ ├── evaluation_classification_in_context_idefics.py │ ├── evaluation_classification_vqa_in_context_idefics.py │ ├── evaluation_image_caption_matching_idefics.py │ ├── evaluation_open_ended_vqa_in_context_idefics.py │ ├── evaluation_perplexity_in_context_idefics.py │ ├── make_tiny_llama.py │ ├── make_tiny_model.py │ └── modeling_idefics.py ├── perceiver │ └── perceiver.py ├── vgpt2 │ ├── __init__.py │ ├── configuration_vgpt2.py │ ├── evaluation_captioning_in_context_vgpt2.py │ ├── evaluation_classification_in_context_vgpt2.py │ ├── evaluation_classification_vqa_in_context_vgpt2.py │ ├── evaluation_image_caption_matching_vgpt2.py │ ├── evaluation_open_ended_vqa_in_context_vgpt2.py │ ├── evaluation_perplexity_in_context_vgpt2.py │ └── modeling_vgpt2.py ├── vllama3 │ ├── __init__.py │ ├── configuration_vllama3.py │ ├── evaluation_captioning_in_context_vllama3.py │ ├── evaluation_classification_in_context_vllama3.py │ ├── evaluation_open_ended_vqa_in_context_vllama3.py │ ├── make_tiny_llama3.py │ ├── make_tiny_model.py │ └── modeling_vllama3.py ├── vmistral │ ├── __init__.py │ ├── configuration_vmistral.py │ ├── evaluation_captioning_in_context_vmistral.py │ ├── evaluation_classification_in_context_vmistral.py │ ├── evaluation_classification_vqa_in_context_vmistral.py │ ├── evaluation_image_caption_matching_vmistral.py │ ├── evaluation_open_ended_vqa_in_context_vmistral.py │ ├── evaluation_perplexity_in_context_vmistral.py │ ├── make_tiny_mistral.py │ ├── make_tiny_model.py │ └── modeling_vmistral.py └── zero_checkpoint_to_hf.py ├── scripts ├── README.md ├── 
behead_unused_params.py ├── clean_jsonl_evals.py ├── cleanup-checkpoints.py ├── convert-checkpoints.py ├── convert_vmistral_lm_head.py ├── convert_vmistral_old_siglip_to_new_siglip.py ├── convert_zero_state_dict_for_new_siglip.py ├── job_update_siglip_model_pos_embeds.slurm ├── merge_lora_and_behead.sh ├── merge_lora_and_resize_eou_template.slurm ├── merge_lora_and_save.py ├── merge_lora_template.slurm ├── resize_embed_for_eou.py ├── s3-upload-checkpoints.py ├── s3_checkpoint_download_convert_upload.py ├── s3_checkpoint_download_convert_upload.slurm ├── s3_downloaded_checkpoints_cleanup.slurm ├── schedule-evals.py ├── update_model_embeds.py ├── update_model_perceiver_latents.py ├── update_siglip_model_pos_embeds.py └── update_vision_model_position_embeds.py ├── sourcing ├── __init__.py ├── data_collection │ ├── README.md │ ├── __init__.py │ ├── callers │ │ ├── __init__.py │ │ ├── deduplicate_images_web_documents.py │ │ ├── download_warc.py │ │ ├── extract_html.py │ │ ├── extract_image_text_pairs.py │ │ ├── extract_web_documents.py │ │ ├── filter_laion_pairs.py │ │ ├── filter_web_documents.py │ │ ├── get_reference_clip_distrib.py │ │ └── line_deduplicate_web_documents.py │ ├── configs │ │ ├── config_extract_web_documents.yaml │ │ ├── config_filter_laion_pairs.yaml │ │ ├── config_filter_text_image_pairs.yaml │ │ └── config_filter_web_documents.yaml │ ├── debug │ │ ├── __init__.py │ │ ├── debug.py │ │ └── get_intuition.py │ ├── docs │ │ ├── filtering_doc.md │ │ └── image_deduplication_doc.md │ ├── outputs │ │ ├── README.md │ │ ├── clip_scores_laion400m_10000.npy │ │ ├── clip_scores_red_caps_10000.npy │ │ ├── clip_scores_sbu_captions_10000.npy │ │ ├── distributions_extracted.png │ │ ├── distributions_reference.png │ │ └── image_text_pairs.jsonl │ ├── processors │ │ ├── __init__.py │ │ ├── dom_tree_simplificator.py │ │ ├── html_extractor.py │ │ ├── image_deduplicator.py │ │ ├── laion_pair_filtering.py │ │ ├── pair_extractor.py │ │ ├── pair_filtering.py │ │ ├── pre_extraction_simplificator.py │ │ ├── warc_downloader.py │ │ ├── web_document_extractor.py │ │ ├── web_document_filtering.py │ │ ├── web_document_image_deduplication.py │ │ └── web_document_line_deduplication.py │ ├── utils │ │ ├── __init__.py │ │ ├── clip_utils.py │ │ ├── fetching_utils.py │ │ ├── filtering_utils.py │ │ ├── kl_utils.py │ │ ├── simplification_utils.py │ │ ├── tags_attributes.py │ │ └── utils.py │ └── visualization │ │ ├── __init__.py │ │ ├── assets │ │ └── DOM_tree_viz.html │ │ ├── choose_filtering_parameters_laion_pairs.py │ │ ├── choose_filtering_parameters_web_documents_node_level.py │ │ ├── get_stats_vis_choose_filtering_parameters_laion_pairs.py │ │ ├── global_visualization.py │ │ ├── pair_stat_dashboard.py │ │ ├── pair_visualization.py │ │ ├── plot_clip_distrib.py │ │ ├── web_document_and_filtering_visualization.py │ │ ├── web_document_visualization.py │ │ └── wikipedia │ │ ├── explore.py │ │ ├── explore_wiki_results.py │ │ └── global_visualization.py ├── get_html_files │ └── common_crawl.md ├── get_modelling_metadata_dataset │ ├── get_modelling_metadata_dataset.py │ ├── get_modelling_metadata_dataset.slurm │ └── shard_names.txt ├── pmd │ ├── __init__.py │ ├── cache_path.py │ ├── fix_image_path.py │ ├── helpers.py │ ├── jz_loaders │ │ ├── __init__.py │ │ ├── jz_conceptual_captions │ │ │ ├── __init__.py │ │ │ └── jz_conceptual_captions.py │ │ └── jz_wit │ │ │ ├── __init__.py │ │ │ └── jz_wit.py │ ├── loader_builder.py │ ├── local_loaders │ │ ├── __init__.py │ │ ├── coco │ │ │ ├── __init__.py │ │ │ └── coco.py │ │ 
├── laion_2b_en │ │ │ └── laion_2b_en.py │ │ ├── localized_narratives__ADE20k │ │ │ ├── __init__.py │ │ │ └── localized_narratives__ADE20k.py │ │ ├── localized_narratives__coco │ │ │ ├── __init__.py │ │ │ └── localized_narratives__coco.py │ │ ├── localized_narratives__flickr30k │ │ │ ├── __init__.py │ │ │ └── localized_narratives__flickr30k.py │ │ ├── localized_narratives__openimages │ │ │ ├── __init__.py │ │ │ └── localized_narratives__openimages.py │ │ └── yfcc100m │ │ │ ├── __init__.py │ │ │ └── yfcc100m.py │ └── scripts │ │ ├── README.md │ │ ├── check_none_ims.py │ │ ├── jz_image_pmd.slurm │ │ ├── jz_pmd.py │ │ └── pmd.py └── processing │ ├── README.md │ ├── __init__.py │ ├── app.py │ └── extracting_ngrams │ ├── README.md │ ├── __init__.py │ ├── extract_documents_ngrams.py │ ├── processing_pipeline.slurm │ ├── run_document_ngrams_extraction.sh │ └── utils.py ├── testing_utils.py ├── training ├── DATA_DOCUMENTATION.md ├── __init__.py ├── config.py ├── dataset.py ├── dataset_utils.py ├── debug_utils.py ├── main.py ├── packing.py ├── setup_language_model.py ├── setup_vision_model.py ├── trainer.py ├── types.py └── utils.py └── utils ├── __init__.py ├── activation_tracker.py ├── check_valid_tokenizer.py ├── datasets ├── __init__.py ├── create_webdataset_tar.py └── get_self_contained_ds.py ├── debug.py ├── logging.py ├── progress.py └── training ├── __init__.py └── timer.py /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | 4 | name: Secret Leaks 5 | 6 | permissions: 7 | contents: read 8 | 9 | jobs: 10 | trufflehog: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | - name: Secret Scanning 18 | uses: trufflesecurity/trufflehog@main -------------------------------------------------------------------------------- /text/data/decontamination/README.md: -------------------------------------------------------------------------------- 1 | # Decontamination 2 | 3 | TODO: add code. 4 | Placeholder here: https://github.com/huggingface/cosmopedia/tree/main/decontamination -------------------------------------------------------------------------------- /text/data/finemath/README.md: -------------------------------------------------------------------------------- 1 | # 📚 FineWeb-Edu pipeline 2 | 3 |
4 | *FineWeb-Edu: The finest collection of educational content the web has to offer* 5 |
6 | 7 | 8 | Here you can find the pipeline for training [FineWeb-Edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/)'s [classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) and running the annotation on FineWeb. 9 | 10 | ### 1. Finetune a model for educational value regression 11 | 12 | * edit `train_edu_bert.slurm` 13 | ```bash 14 | --base_model_name="Snowflake/snowflake-arctic-embed-m" \ # BERT-like base model 15 | --dataset_name="HuggingFaceFW/fineweb-edu-llama3-annotations" \ # Llama3-annotated educational value dataset 16 | --target_column="score" 17 | ``` 18 | * run the training script on a SLURM cluster: 19 | ```bash 20 | sbatch train_edu_bert.slurm 21 | ``` 22 | 23 | ### 2. Annotate a dataset with the educational scores predicted by the model 24 | 25 | ```bash 26 | sbatch run_edu_bert.slurm 27 | ``` -------------------------------------------------------------------------------- /text/data/smoltalk/README.md: -------------------------------------------------------------------------------- 1 | # SmolTalk: distilabel pipelines 2 | We released [SmolTalk](https://huggingface.co/datasets/HuggingFaceTB/smoltalk), the SFT dataset used for building the SmolLM2 instruct models. It was created with [distilabel](https://github.com/argilla-io/distilabel) and you can find the synthetic data pipelines here. 3 |
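To take a quick look at the released data, you can load it directly with `datasets`. A minimal sketch (the `"all"` config name and the `messages` column follow the dataset card; adjust if the card changes):

```python
from datasets import load_dataset

# Load the full SmolTalk mixture; the individual pipelines also have their own configs.
ds = load_dataset("HuggingFaceTB/smoltalk", "all", split="train")
print(ds[0]["messages"])  # each example is a list of chat turns (column name per the dataset card)
```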
5 | 6 |

Comparison of models finetuned on SmolTalk and Orca AgentInstruct 1M. For more details, refer to the dataset card.

7 |
8 | 9 | > [!NOTE] 10 | > This section is still a work in progress. We will upload the rest of the pipelines soon. Thanks for your patience! 11 | 12 | -------------------------------------------------------------------------------- /text/data/smoltalk/constraints/README.md: -------------------------------------------------------------------------------- 1 | # Smol-Constraints 2 | 3 | This pipeline generates synthetic data similar to that in the [google/IFEval](https://huggingface.co/datasets/google/IFEval) dataset/benchmark. 4 | -------------------------------------------------------------------------------- /text/data/smoltalk/constraints/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/text/data/smoltalk/constraints/pipeline/__init__.py -------------------------------------------------------------------------------- /text/data/smoltalk/magpie_ultra_v1/README.md: -------------------------------------------------------------------------------- 1 | # MagPie Ultra v1.0 2 | 3 | This [`distilabel`](https://github.com/argilla-io/distilabel) pipeline was used to generate the [magpie-ultra-v1.0](https://huggingface.co/datasets/argilla/magpie-ultra-v1.0) dataset. It follows the [MagPie](https://magpie-align.github.io) recipe to generate a multi-turn conversation dataset using [meta-llama/Llama-3.1-405B-Instruct-FP8](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct-FP8). 4 | 5 | ## Setup 6 | 7 | You will need to install `distilabel` with a few extra dependencies to be able to execute the pipeline: 8 | 9 | ```bash 10 | pip install distilabel[ray,vllm,sentence-transformers,faiss-cpu,hf-transformers] 11 | ``` -------------------------------------------------------------------------------- /text/data/smoltalk/rewrite/README.md: -------------------------------------------------------------------------------- 1 | # Smol-Rewrite 2 | 3 | This pipeline is used to generate a synthetic dataset for training a rewriting assistant.
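The pipeline script below imports `get_dataset` from a local `dataset.py`, which is not reproduced in this dump. A hypothetical sketch of such a loader, assuming the texts to rewrite sit in a Hugging Face dataset with a `text` column (both the repo id and the column name here are placeholders, not the repo's actual choices):

```python
# Hypothetical sketch of dataset.py; not the actual module from the repo.
from datasets import Dataset, load_dataset


def get_dataset() -> Dataset:
    # Placeholder source dataset; the real pipeline uses its own curated inputs.
    ds = load_dataset("HuggingFaceTB/placeholder-source-texts", split="train")
    # distilabel's TextGeneration step consumes an "instruction" column by default.
    return ds.rename_column("text", "instruction")
```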
4 | -------------------------------------------------------------------------------- /text/data/smoltalk/rewrite/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/text/data/smoltalk/rewrite/pipeline/__init__.py -------------------------------------------------------------------------------- /text/data/smoltalk/rewrite/pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | from dataset import get_dataset 2 | from distilabel.llms import vLLM 3 | from distilabel.pipeline import Pipeline 4 | from distilabel.steps import StepResources 5 | from distilabel.steps.tasks import TextGeneration 6 | 7 | with Pipeline(name="smol-rewrite").ray() as pipeline: 8 | TextGeneration( 9 | llm=vLLM( 10 | model="Qwen/Qwen2.5-72B-Instruct", 11 | tokenizer="Qwen/Qwen2.5-72B-Instruct", 12 | generation_kwargs={ 13 | "temperature": 0.2, 14 | "max_new_tokens": 1024, 15 | "top_p": 0.95, 16 | }, 17 | extra_kwargs={ 18 | "tensor_parallel_size": 8, 19 | "max_model_len": 4096, 20 | "enable_prefix_caching": True, 21 | }, 22 | ), 23 | input_batch_size=1000, 24 | resources=StepResources(replicas=4), 25 | ) 26 | 27 | 28 | if __name__ == "__main__": 29 | dataset = get_dataset() 30 | distiset = pipeline.run(dataset=dataset, dataset_batch_size=10000, use_cache=True) 31 | distiset.push_to_hub("HuggingFaceTB/smollm-v2-rewriting") 32 | -------------------------------------------------------------------------------- /text/data/smoltalk/summarization/README.md: -------------------------------------------------------------------------------- 1 | # Smol-Summarization -------------------------------------------------------------------------------- /text/evaluation/README.md: -------------------------------------------------------------------------------- 1 | # SmolLM evaluation scripts 2 | 3 | We're using the [LightEval](https://github.com/huggingface/lighteval/) library to benchmark our models. 4 | 5 | Check out the [quick tour](https://github.com/huggingface/lighteval/wiki/Quicktour) to configure it to your own hardware and tasks. 6 | 7 | ## Setup 8 | 9 | Use conda/venv with `python>=3.10`. 
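For example, a minimal environment setup sketch (the environment name is arbitrary):

```bash
python3.10 -m venv lighteval-env
source lighteval-env/bin/activate
pip install --upgrade pip
```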
10 | 11 | Adjust the pytorch installation according to your environment: 12 | ```bash 13 | pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121 14 | ``` 15 | For reproducibility, we recommend fixed versions of the libraries: 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Running the evaluations 21 | 22 | ### SmolLM2 base models 23 | 24 | ```bash 25 | lighteval accelerate \ 26 | --model_args "pretrained=HuggingFaceTB/SmolLM2-1.7B,revision=main,dtype=bfloat16,vllm,gpu_memory_utilisation=0.8,max_model_length=2048" \ 27 | --custom_tasks "tasks.py" --tasks "smollm2_base.txt" --output_dir "./evals" --save_details 28 | ``` 29 | 30 | ### SmolLM2 instruction-tuned models 31 | 32 | (note the `--use_chat_template` flag) 33 | ```bash 34 | lighteval accelerate \ 35 | --model_args "pretrained=HuggingFaceTB/SmolLM2-1.7B-Instruct,revision=main,dtype=bfloat16,vllm,gpu_memory_utilisation=0.8,max_model_length=2048" \ 36 | --custom_tasks "tasks.py" --tasks "smollm2_instruct.txt" --use_chat_template --output_dir "./evals" --save_details 37 | ``` 38 | 39 | ### FineMath dataset ablations 40 | 41 | See the collection for model names: https://huggingface.co/collections/HuggingFaceTB/finemath-6763fb8f71b6439b653482c2 42 | 43 | ```bash 44 | lighteval accelerate \ 45 | --model_args "pretrained=HuggingFaceTB/finemath-ablation-4plus-160B,revision=main,dtype=bfloat16,vllm,gpu_memory_utilisation=0.7,max_model_length=4096" \ 46 | --custom_tasks "tasks.py" --tasks "custom|math|4|1,custom|gsm8k|5|1,custom|arc:challenge|0|1,custom|mmlu_pro|0|1,custom|hellaswag|0|1" --output_dir "./evals" --save_details 47 | ``` 48 | -------------------------------------------------------------------------------- /text/evaluation/requirements.txt: -------------------------------------------------------------------------------- 1 | lighteval[accelerate,extended_tasks,vllm] @ git+https://github.com/huggingface/lighteval@ea46419a93fb390e8f694f7c6c64c1e684487c9d 2 | fsspec>=2024.3.0 3 | word2number -------------------------------------------------------------------------------- /text/evaluation/smollm2_base.txt: -------------------------------------------------------------------------------- 1 | custom|hellaswag|0|1 2 | custom|arc|0|1 3 | custom|piqa|0|1 4 | custom|mmlu_pro|0|1 5 | custom|commonsense_qa|0|1 6 | custom|trivia_qa|0|1 7 | custom|winogrande|0|1 8 | custom|openbook_qa|0|1 9 | custom|gsm8k|5|1 -------------------------------------------------------------------------------- /text/evaluation/smollm2_instruct.txt: -------------------------------------------------------------------------------- 1 | extended|ifeval|0|0 2 | custom|hellaswag|0|1 3 | custom|arc|0|1 4 | custom|piqa|0|1 5 | custom|mmlu_pro|0|1 6 | custom|bbh|3|1 7 | custom|gsm8k|5|1 -------------------------------------------------------------------------------- /text/finetuning/Dockerfile: -------------------------------------------------------------------------------- 1 | # base image: CUDA 12.1 2 | FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 3 | 4 | WORKDIR /app 5 | 6 | # install necessary packages 7 | RUN apt-get update && apt-get install -y \ 8 | git \ 9 | wget \ 10 | curl \ 11 | ca-certificates \ 12 | libglib2.0-0 \ 13 | libsm6 \ 14 | libxrender1 \ 15 | libxext6 \ 16 | libssl-dev \ 17 | libffi-dev \ 18 | python3 \ 19 | python3-pip \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | # set python3 as default python 23 | RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 24 | 25 | RUN pip install 
--upgrade pip setuptools 26 | 27 | RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu121 28 | 29 | COPY requirements.txt . 30 | RUN pip install -r requirements.txt 31 | 32 | COPY . . 33 | 34 | ENV PYTHONUNBUFFERED=1 35 | 36 | CMD ["bash"] -------------------------------------------------------------------------------- /text/finetuning/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | trl>=0.15 3 | peft 4 | accelerate 5 | datasets 6 | wandb 7 | bitsandbytes 8 | -------------------------------------------------------------------------------- /text/pretraining/README.md: -------------------------------------------------------------------------------- 1 | # Pretraining 2 | We use the [nanotron](https://github.com/huggingface/nanotron/) library for training the SmolLM and SmolLM2 base models. 3 | 4 | The scripts for training SmolLM v1 can be found in the `smollm1` folder, and those for training SmolLM2 in the `smollm2` folder; we will add the details of the data mixture soon. SmolLM2 uses a similar architecture to SmolLM but an improved data mixture and significantly longer training (11 trillion tokens for the 1.7B, 4 trillion for the 360M and 2 trillion for the 135M). 5 | 6 | ## Setup 7 | 8 | Please refer to [nanotron](https://github.com/huggingface/nanotron/) for detailed instructions on setting up your training environment and launching jobs. 9 | 10 | After setting up the environment and tokenizing the training datasets with [datatrove](https://github.com/huggingface/datatrove) (instructions available [here](https://github.com/huggingface/nanotron/blob/main/docs/nanoset.md#nanosets)), you can modify the configurations to match your number of nodes and local paths. 11 | 12 | Below is an example of launching SmolLM1 135M training on 1 node; set the DP value to 8 in the config, adjust the batch size accordingly, and run: 13 | 14 | ```bash 15 | git clone https://github.com/huggingface/nanotron 16 | cd nanotron 17 | # follow installation 18 | CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file smollm1/config_smollm1_135M.yaml 19 | ``` 20 | 21 | If you are working on a SLURM cluster, you can modify `launch.slurm` and launch the training with: 22 | 23 | ```bash 24 | sbatch launch.slurm 25 | ``` 26 | > [!NOTE] 27 | > Don't forget to create the logs directory before launching the job. 28 | 29 | ## Continual pre-training 30 | 31 | The nanotron checkpoints for the SmolLM2 models are available at: https://huggingface.co/HuggingFaceTB/SmolLM2-nanotron-ckpt 32 | 33 | You can find an example of continual pre-training in the [continual-pretraining](./continual-pretraining) folder. 34 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Tools for local inference 2 | 3 | Here you can find tools and demos for running SmolLM2 and SmolVLM locally, leveraging libraries such as llama.cpp, MLX, MLC and Transformers.js.
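Before reaching for the dedicated runtimes below, you can also sanity-check SmolLM2 with plain `transformers`. A minimal sketch (assumes enough RAM or VRAM for the 1.7B instruct checkpoint used elsewhere in these demos):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

messages = [{"role": "user", "content": "What is the capital of France?"}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(inputs, max_new_tokens=64)
# Decode only the newly generated tokens.
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```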
-------------------------------------------------------------------------------- /tools/smol_tools/requirements.txt: -------------------------------------------------------------------------------- 1 | tkmacosx>=1.0.5 2 | pynput>=1.7.7 3 | llama-cpp-python>=0.3.1 4 | pyperclip>=1.9.0 5 | transformers>=4.46.2 6 | pygments>=2.18.0 -------------------------------------------------------------------------------- /tools/smol_tools/smol_tools/rewriter.py: -------------------------------------------------------------------------------- 1 | from .base import SmolTool 2 | from typing import Generator 3 | 4 | class SmolRewriter(SmolTool): 5 | def __init__(self): 6 | super().__init__( 7 | model_repo="andito/SmolLM2-1.7B-Instruct-F16-GGUF", 8 | model_filename="smollm2-1.7b-8k-dpo-f16.gguf", 9 | system_prompt="You are an AI writing assistant. Your task is to rewrite the user's email to make it more professional and approachable while maintaining its main points and key message. Do not return any text other than the rewritten message.", 10 | prefix_text="Rewrite the message below to make it more professional and approachable while maintaining its main points and key message. Do not add any new information or return any text other than the rewritten message\nThe message:" 11 | ) 12 | 13 | def process(self, text: str) -> Generator[str, None, None]: 14 | messages = [ 15 | {"role": "system", "content": self.system_prompt}, 16 | {"role": "user", "content": f"{self.prefix_text}\n{text}"} 17 | ] 18 | yield from self._create_chat_completion(messages, temperature=0.4, repeat_penalty=1.0, top_k=0, max_tokens=1024) -------------------------------------------------------------------------------- /tools/smol_tools/smol_tools/summarizer.py: -------------------------------------------------------------------------------- 1 | from .base import SmolTool 2 | from typing import Generator, Optional 3 | from dataclasses import dataclass 4 | from datetime import datetime 5 | from typing import List 6 | 7 | @dataclass 8 | class SummaryMessage: 9 | role: str # "user" or "assistant" 10 | content: str 11 | timestamp: datetime 12 | 13 | class SmolSummarizer(SmolTool): 14 | def __init__(self): 15 | self.name = "SmolLM2-1.7B" 16 | 17 | super().__init__( 18 | model_repo="andito/SmolLM2-1.7B-Instruct-F16-GGUF", 19 | model_filename="smollm2-1.7b-8k-dpo-f16.gguf", 20 | system_prompt="Concisely summarize the main points of the input text in up to three sentences, focusing on key information and events.", 21 | ) 22 | 23 | def process(self, text: str, question: Optional[str] = None) -> Generator[str, None, None]: 24 | if question is None: 25 | print("Summarizing text") 26 | prompt = f"{self.prefix_text}\n{text}" 27 | messages = [ 28 | {"role": "system", "content": self.system_prompt}, 29 | {"role": "user", "content": prompt}, 30 | {"role": "assistant", "content": "This is a short summary of the text:"} 31 | ] 32 | else: 33 | print("Answering question") 34 | prompt = f"Original text:\n{text}\n\nQuestion: {question}" 35 | messages = [ 36 | {"role": "user", "content": prompt}, 37 | ] 38 | 39 | for chunk in self._create_chat_completion(messages, max_tokens=1024, temperature=0.1, top_p=0.9): 40 | yield chunk 41 | -------------------------------------------------------------------------------- /tools/smol_tools/smol_tools/titler.py: -------------------------------------------------------------------------------- 1 | from .base import SmolTool 2 | from typing import Generator 3 | 4 | class SmolTitler(SmolTool): 5 | def __init__(self): 6 | 
super().__init__( 7 | model_repo="andito/SmolLM2-1.7B-Instruct-F16-GGUF", 8 | model_filename="smollm2-1.7b-8k-dpo-f16.gguf", 9 | system_prompt="", 10 | prefix_text="Create a title for this conversation:", 11 | ) 12 | 13 | def process(self, text: str) -> Generator[str, None, None]: 14 | messages = [ 15 | {"role": "user", "content": f"{self.prefix_text}\n{text}"} 16 | ] 17 | yield from self._create_chat_completion(messages, max_tokens=128, temperature=0.6, top_p=0.9, top_k=0, repeat_penalty=1.1) -------------------------------------------------------------------------------- /tools/smollm_local_inference/llama-cpp-python.py: -------------------------------------------------------------------------------- 1 | from llama_cpp import Llama 2 | 3 | llm = Llama.from_pretrained( 4 | repo_id="HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF", 5 | filename="*q4_k_m.gguf", 6 | verbose=False 7 | ) 8 | 9 | output = llm( 10 | "Q: Name the planets in the solar system? A: ", # Prompt 11 | max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window 12 | stop=["Q:", "\n"], # Stop generating just before the model would generate a new question 13 | echo=True # Echo the prompt back in the output 14 | ) # Generate a completion, can also call create_completion 15 | 16 | print(output) -------------------------------------------------------------------------------- /tools/smollm_local_inference/mlc.py: -------------------------------------------------------------------------------- 1 | from mlc_llm import MLCEngine 2 | 3 | # Create engine 4 | model = "HF://mlc-ai/SmolLM2-1.7B-Instruct-q0f16-MLC" 5 | engine = MLCEngine(model) 6 | 7 | # Run chat completion in OpenAI API. 8 | for response in engine.chat.completions.create( 9 | messages=[{"role": "user", "content": "What is the meaning of life?"}], 10 | model=model, 11 | stream=True, 12 | ): 13 | for choice in response.choices: 14 | print(choice.delta.content, end="", flush=True) 15 | print("\n") 16 | 17 | engine.terminate() 18 | -------------------------------------------------------------------------------- /tools/smollm_local_inference/mlx.py: -------------------------------------------------------------------------------- 1 | from mlx_lm import load, generate 2 | 3 | model, tokenizer = load("HuggingFaceTB/SmolLM2-1.7B-Instruct-Q8-mlx") 4 | 5 | prompt = "Hello" 6 | 7 | messages = [{"role": "user", "content": prompt}] 8 | prompt = tokenizer.apply_chat_template( 9 | messages, tokenize=False, add_generation_prompt=True 10 | ) 11 | 12 | response = generate(model, tokenizer, prompt=prompt, verbose=True) 13 | print(response) 14 | -------------------------------------------------------------------------------- /tools/smollm_local_inference/transformers-js.js: -------------------------------------------------------------------------------- 1 | import { pipeline } from "@huggingface/transformers"; 2 | 3 | // Create a text generation pipeline 4 | const generator = await pipeline( 5 | "text-generation", 6 | "HuggingFaceTB/SmolLM2-1.7B-Instruct", 7 | { dtype: "q4f16" }, 8 | ); 9 | 10 | // Define the list of messages 11 | const messages = [ 12 | { role: "system", content: "You are a helpful assistant." }, 13 | { role: "user", content: "Rewrite the following: hello how r u?" }, 14 | ]; 15 | 16 | // Generate a response 17 | const output = await generator(messages, { max_new_tokens: 128 }); 18 | console.log(output[0].generated_text.at(-1).content); 19 | // "Hello, how's it going?" 
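A usage sketch for the `smol_tools` classes above, streaming a rewrite from `SmolRewriter`. This assumes the `SmolTool` base class wires `model_repo`/`model_filename` into llama-cpp-python (which fetches the GGUF weights on first use) and implements `_create_chat_completion` as the constructor arguments suggest:

```python
from smol_tools.rewriter import SmolRewriter

rewriter = SmolRewriter()
# process() yields the rewritten text incrementally as it is generated.
for chunk in rewriter.process("hey, can u send me the report asap? thx"):
    print(chunk, end="", flush=True)
print()
```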
-------------------------------------------------------------------------------- /vision/data/README.md: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | The scripts inside `datasets_processing_scripts` are the ones we used to create all the datasets used for training SmolVLM. -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/01_convert_coco_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--shard_dir_path", type=Path, required=True) 25 | parser.add_argument("--saving_dir", type=Path, required=True) 26 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 27 | parser.add_argument("--num_proc", type=int, required=True) 28 | parser.add_argument("--shard_idx", type=int, required=True) 29 | parser.add_argument("--min_num_shards", type=int) 30 | args = parser.parse_args() 31 | return args 32 | 33 | 34 | def main(args): 35 | shard_1_dirs = [shard_dir for shard_dir in args.shard_dir_path.iterdir()] 36 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS 37 | 38 | export_dataset_shard_idx_to_tar( 39 | hf_datasets_paths=shard_1_dirs, 40 | saving_dir=args.saving_dir, 41 | ds_type=ds_type, 42 | num_examples_per_shard=args.num_examples_per_shard, 43 | num_proc=args.num_proc, 44 | shard_idx=args.shard_idx, 45 | min_num_shards=args.min_num_shards, 46 | ) 47 | 48 | 49 | if __name__ == "__main__": 50 | args = get_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/02_convert_cm4_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--shard_dir_path", type=Path, required=True) 25 | parser.add_argument("--saving_dir", type=Path, required=True) 26 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 27 | parser.add_argument("--num_proc", type=int, required=True) 28 | parser.add_argument("--shard_idx", type=int, required=True) 29 | parser.add_argument("--min_num_shards", type=int) 30 | args = parser.parse_args() 31 | return args 32 | 33
| 34 | def main(args): 35 | shard_1_dirs = [shard_dir for shard_dir in args.shard_dir_path.iterdir()] 36 | ds_type = DatasetTypes.WEB_DOCUMENTS 37 | 38 | export_dataset_shard_idx_to_tar( 39 | hf_datasets_paths=shard_1_dirs, 40 | saving_dir=args.saving_dir, 41 | ds_type=ds_type, 42 | num_examples_per_shard=args.num_examples_per_shard, 43 | num_proc=args.num_proc, 44 | shard_idx=args.shard_idx, 45 | min_num_shards=args.min_num_shards, 46 | ) 47 | 48 | 49 | if __name__ == "__main__": 50 | args = get_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/06_convert_lrv_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.LRV_PAIRS 35 | 36 | export_dataset_shard_idx_to_tar( 37 | hf_datasets_paths=["VictorSanh/LrvInstruction:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/07_convert_llava_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | 
def main(args): 34 | ds_type = DatasetTypes.LLaVA 35 | 36 | export_dataset_shard_idx_to_tar( 37 | hf_datasets_paths=["HuggingFaceM4/LLaVA-Instruct-150K:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/08_convert_svit_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.LRV_PAIRS 35 | 36 | export_dataset_shard_idx_to_tar( 37 | hf_datasets_paths=["HuggingFaceM4/SVITMerged:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/09_convert_ultrachat_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.TEXT_DIALOGUE 
35 | 36 | export_dataset_shard_idx_to_tar( 37 | hf_datasets_paths=["stingning/ultrachat:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/10_convert_m3it_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.M3IT_PAIRS 35 | 36 | export_dataset_shard_idx_to_tar( 37 | hf_datasets_paths=["HuggingFaceM4/M3IT:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/11_convert_spot_difference_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.SPOT_DIFFERENCE_PAIRS 35 | 36 | export_dataset_shard_idx_to_tar( 37 | 
hf_datasets_paths=["HuggingFaceM4/SpotDifference_4:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/12_convert_llavar_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.LLaVA 35 | 36 | export_dataset_shard_idx_to_tar( 37 | hf_datasets_paths=["HuggingFaceM4/LLaVAR-Instruct-16K:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/13_convert_vqav2_task_finetuning_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.VQAV2_TASK_FINETUNING 35 | 36 | export_dataset_shard_idx_to_tar( 37 | 
hf_datasets_paths=["HuggingFaceM4/vqav2_task_finetuning:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/14_convert_tikz_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS 35 | 36 | export_dataset_shard_idx_to_tar( 37 | hf_datasets_paths=["HuggingFaceM4/datikz_modif:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/15_convert_docvqa_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.DOCVQA 35 | 36 | export_dataset_shard_idx_to_tar( 37 | 
hf_datasets_paths=["HuggingFaceM4/DocumentVQA:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/16_convert_image_website_code_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS 35 | 36 | export_dataset_shard_idx_to_tar( 37 | hf_datasets_paths=["HuggingFaceM4/image_to_website_code:train"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/17_convert_websight_v02_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--saving_dir", type=Path, required=True) 25 | parser.add_argument("--num_examples_per_shard", type=int, required=True) 26 | parser.add_argument("--num_proc", type=int, required=True) 27 | parser.add_argument("--shard_idx", type=int, required=True) 28 | parser.add_argument("--min_num_shards", type=int) 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def main(args): 34 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS 35 | 36 | export_dataset_shard_idx_to_tar( 37 | 
hf_datasets_paths=["/fsx/hugo/ds_websight_v02"], 38 | saving_dir=args.saving_dir, 39 | ds_type=ds_type, 40 | num_examples_per_shard=args.num_examples_per_shard, 41 | num_proc=args.num_proc, 42 | shard_idx=args.shard_idx, 43 | min_num_shards=args.min_num_shards, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | args = get_args() 49 | main(args) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/01_tar_datasets_with_jpeg/python_scripts/19_convert_websight_mix_per_shard_idx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from datasets import set_caching_enabled 6 | 7 | from m4.training.types import DatasetTypes 8 | from m4.utils.datasets.create_webdataset_tar import export_dataset_all_shard_idx_to_tar 9 | 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 14 | datefmt="%m/%d/%Y %H:%M:%S", 15 | ) 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | set_caching_enabled(False) 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--ds_path", type=Path, required=True) 25 | parser.add_argument("--saving_dir", type=Path, required=True) 26 | parser.add_argument("--num_examples_per_shard", type=int) 27 | parser.add_argument("--s3_uri", type=str) 28 | parser.add_argument("--num_proc", type=int, required=True) 29 | parser.add_argument("--min_num_shards", type=int) 30 | parser.add_argument("--save_shard_prefix", type=str, default="") 31 | 32 | args = parser.parse_args() 33 | return args 34 | 35 | 36 | def main(args): 37 | ds_paths = [args.ds_path] 38 | ds_type = DatasetTypes.IMAGE_CAPTION_PAIRS 39 | 40 | export_dataset_all_shard_idx_to_tar( 41 | hf_datasets_paths=ds_paths, 42 | saving_dir=args.saving_dir, 43 | ds_type=ds_type, 44 | num_examples_per_shard=args.num_examples_per_shard, 45 | s3_uri=args.s3_uri, 46 | num_proc=args.num_proc, 47 | min_num_shards=args.min_num_shards, 48 | save_shard_prefix=args.save_shard_prefix, 49 | ) 50 | 51 | 52 | if __name__ == "__main__": 53 | args = get_args() 54 | main(args) 55 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/build_concatenation_datasets_sft/create_set_hashes_test_images.py: -------------------------------------------------------------------------------- 1 | """ 2 | srun --pty --cpus-per-task=96 --mem-per-cpu=20G --partition=hopper-prod bash -i 3 | conda activate shared-m4 4 | """ 5 | 6 | 7 | import hashlib 8 | import json 9 | 10 | from datasets import concatenate_datasets, load_dataset 11 | from tqdm import tqdm 12 | 13 | 14 | NAME_DS_TO_HASH = [ 15 | "HuggingFaceM4/MMBench_modif_chatbot", 16 | "HuggingFaceM4/MathVista-modif", 17 | "HuggingFaceM4/MMMU-modif", 18 | ] 19 | 20 | PATH_SAVE_LIST_HASHES = "/fsx/hugo/fine_tuning_datasets_merge_image_individual/list_hashes_test_images.json" 21 | 22 | 23 | list_hashes = [] 24 | 25 | for name_ds in tqdm(NAME_DS_TO_HASH): 26 | potential_subset_names = ["testmini", "test", "validation", "dev"] 27 | all_splits = [] 28 | for split in potential_subset_names: 29 | try: 30 | all_splits.append(load_dataset(name_ds, split=split)) 31 | except Exception: 32 | pass 33 | ds = concatenate_datasets(all_splits) 34 | if "image" in ds.column_names: 35 | images = ds["image"] 36 | elif "images" in ds.column_names: 37 | 
images = ds["images"] 38 | images = [img for list_images in images for img in list_images] 39 | else: 40 | raise ValueError("images not found in the dataset") 41 | for img in tqdm(images): 42 | md5hash = hashlib.md5(img.tobytes()).hexdigest() 43 | list_hashes.append(md5hash) 44 | 45 | 46 | with open(PATH_SAVE_LIST_HASHES, "w") as f: 47 | json.dump(list_hashes, f) 48 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/build_concatenation_datasets_sft/job_build_the_cauldron.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=build_the_cauldron 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=88 6 | #SBATCH --mem-per-cpu=21G 7 | #SBATCH --output=/fsx/m4/experiments/general_logs/build_the_cauldron/res%A_%a 8 | #SBATCH --partition=hopper-prod 9 | #SBATCH --qos high 10 | 11 | source /fsx/m4/start-m4-user 12 | conda activate shared-m4 13 | 14 | 15 | python /fsx/hugo/repos/m4_28/datasets_processing_scripts/build_concatenation_datasets_sft/build_the_cauldron.py ${SLURM_ARRAY_TASK_ID} 16 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/build_concatenation_datasets_sft/job_merge_on_image_individual_dataset.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=merge_on_image_individual_dataset 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=88 6 | #SBATCH --mem-per-cpu=21G 7 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_on_image_individual_dataset/res%A_%a 8 | #SBATCH --partition=hopper-prod 9 | 10 | source /fsx/m4/start-m4-user 11 | conda activate /fsx/m4/conda/hugo_3 12 | 13 | 14 | python /fsx/hugo/repos/m4_15/datasets_processing_scripts/build_concatenation_datasets_sft/merge_on_image_individual_dataset.py ${SLURM_ARRAY_TASK_ID} 15 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/build_concatenation_datasets_sft/tar_dataset_pattern_check.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import datasets 4 | 5 | from m4.training.dataset_utils import get_webdataset 6 | from m4.training.types import DatasetTypes 7 | 8 | 9 | base_path = "/fsx/leo/fine_tuning_datasets/concat_chatty_tar/shard_{index}.tar" 10 | 11 | # Generate paths using a list comprehension 12 | webdataset_paths = [base_path.format(index=i) for i in range(0, 1785)] 13 | 14 | 15 | FEATURES = datasets.Features( 16 | { 17 | "__key__": datasets.Value("string"), 18 | "__url__": datasets.Value("string"), 19 | "images": datasets.Sequence(datasets.Image(decode=True)), 20 | "texts": [ 21 | { 22 | "user": datasets.Value("string"), 23 | "assistant": datasets.Value("string"), 24 | "source": datasets.Value("string"), 25 | } 26 | ], 27 | } 28 | ) 29 | combined_dataset = get_webdataset( 30 | urls=webdataset_paths, 31 | ds_type=DatasetTypes.SFT, 32 | batch_size=10, 33 | shuffle_initial_urls_list=False, 34 | shuffle_before_split_by_node_buffer_size=(None), 35 | shuffle_before_split_by_worker_buffer_size=(None), 36 | shuffle_after_tarfile_to_samples_buffer_size=(None), 37 | shuffle_after_batching_buffer_size=None, 38 | ) 39 | 40 | # Regex pattern 41 | pattern = r"[?!:]\." 
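# The regex above matches a "?", "!" or ":" immediately followed by a period (e.g. "?." or
# "!."), punctuation sequences that rarely occur in clean English text and that presumably
# betray artifacts introduced when the assistant turns were rewritten or concatenated; the
# loop below simply counts such occurrences over the tars as a sanity check.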
42 | # Find all occurrences 43 | all_matches = [] 44 | # Process each text 45 | for batch in combined_dataset: 46 | for turns in batch["texts"]: 47 | for turn in turns: 48 | text = turn["assistant"] 49 | matches = re.findall(pattern, text) 50 | if matches: 51 | all_matches.extend(matches) 52 | 53 | print(f"len matches: {len(all_matches)}") 54 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/build_laion_dataset/python_scripts/03_01_prepare_dedup_laion.py: -------------------------------------------------------------------------------- 1 | # The following steps were done in different jobs; this is to give an idea of what was done 2 | 3 | 4 | import pickle 5 | 6 | import numpy as np 7 | from datasets import load_dataset 8 | from tqdm import tqdm 9 | 10 | 11 | laion_dataset = load_dataset("laion/laion2b-en-vit-h-14-embeddings")["train"] # Takes a long time to download 12 | # The md5 is shorter than the url to identify an image. Moreover, some images in the dataset are the same but under 13 | # different urls. In this case they have the same md5, and we'll be able to have even more compact data 14 | # laion_dataset_md5 is uploaded at s3://m4-datasets/trash/laion_dataset_md5/ 15 | laion_dataset_md5 = laion_dataset.remove_columns([c_n for c_n in laion_dataset.column_names if c_n != "md5"]) 16 | 17 | # Download at https://huggingface.co/datasets/fraisdufour/snip-dedup/resolve/main/is_dup_mlp_1024_128_gelu_snn_2layer_notext.npy 18 | is_dup_all = np.load("/fsx/hugo/prepare_dedup_laion/is_dup_mlp_1024_128_gelu_snn_2layer_notext.npy").ravel() 19 | 20 | list_index_dup = [idx for idx, el in enumerate(is_dup_all) if el] + [ 21 | idx for idx in range(len(is_dup_all), len(laion_dataset_md5)) 22 | ] 23 | set_dup = set() 24 | for idx in tqdm(list_index_dup): 25 | set_dup.add(laion_dataset_md5[idx]["md5"]) 26 | 27 | # set_dup_md5.pkl is uploaded at s3://m4-datasets/trash/set_dup_md5.pkl 28 | with open("/fsx/hugo/prepare_dedup_laion/set_dup_md5.pkl", "wb") as f: 29 | pickle.dump(set_dup, f) 30 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/build_laion_dataset/python_scripts/04_02_create_ds_laion_urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | srun --pty --cpus-per-task=48 --mem-per-cpu=11G bash -i 3 | conda activate /fsx/m4/conda/shared-m4-2023-03-10 4 | """ 5 | 6 | 7 | import json 8 | import os 9 | 10 | from datasets import Dataset 11 | from tqdm import tqdm 12 | 13 | 14 | NUM_SHARDS = 200 15 | 16 | PATH_LAION_URLS_S3 = "s3://m4-datasets/LAION_data/urls_laion_dataset_filtered_dedup/" 17 | PATH_LAION_URLS_LOCAL = "/scratch/laion_urls" 18 | 19 | PATH_SAVE_DISK_DS_LAION_URLS = "/scratch/ds_laion_urls" 20 | PATH_SAVE_S3_DS_LAION_URLS = "s3://m4-datasets/LAION_data/ds_urls_laion_dataset_filtered_dedup/" 21 | 22 | NUM_PROC = 48 23 | 24 | 25 | if __name__ == "__main__": 26 | command_sync_s3 = f"aws s3 sync {PATH_LAION_URLS_S3} {PATH_LAION_URLS_LOCAL}" 27 | os.system(command_sync_s3) 28 | 29 | all_urls = [] 30 | for idx_shard in tqdm(range(NUM_SHARDS)): 31 | if idx_shard not in [184, 189]: 32 | path_urls_laion_shard = os.path.join(PATH_LAION_URLS_LOCAL, str(idx_shard), "laion_urls.json") 33 | with open(path_urls_laion_shard) as f: 34 | all_urls.extend(json.load(f)) 35 | 36 | ds_laion_urls = Dataset.from_dict({"url": all_urls}) 37 | ds_laion_urls.save_to_disk(PATH_SAVE_DISK_DS_LAION_URLS, num_proc=NUM_PROC) 38 | 39
| command_sync_s3 = f"aws s3 sync {PATH_SAVE_DISK_DS_LAION_URLS} {PATH_SAVE_S3_DS_LAION_URLS}" 40 | os.system(command_sync_s3) 41 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts/03_parallel_dl_images_create_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | import sys 5 | 6 | import numpy as np 7 | 8 | 9 | idx_machine = int(sys.argv[1]) 10 | 11 | IDX_REMAINING = [idx for idx in range(200)] 12 | NUM_MACHINES = 21 13 | IDX = [el.tolist() for el in np.array_split(IDX_REMAINING, NUM_MACHINES)][idx_machine] 14 | PATH_LOG = "/scratch/log.txt" 15 | 16 | 17 | for idx in IDX: 18 | f = open(PATH_LOG, "a") 19 | f.write(f"Starting job {idx}\n") 20 | f.close() 21 | 22 | os.system("sudo truncate -s 0 /var/log/syslog") 23 | 24 | p = subprocess.Popen( 25 | f"python3 m4/sourcing/data_collection/callers/dl_images_create_dataset.py {idx} --download_only 1", 26 | shell=True, 27 | preexec_fn=os.setsid, 28 | ) 29 | try: 30 | p.wait(2 * 60 * 60) 31 | except subprocess.TimeoutExpired: 32 | os.killpg(os.getpgid(p.pid), signal.SIGTERM) 33 | # p.kill() 34 | 35 | f = open(PATH_LOG, "a") 36 | f.write(f"{idx} done with download only\n") 37 | f.close() 38 | 39 | os.system(f"python3 m4/sourcing/data_collection/callers/dl_images_create_dataset.py {idx} --U 1") 40 | 41 | f = open(PATH_LOG, "a") 42 | f.write(f"{idx} done with create image dataset only\n") 43 | f.close() 44 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts/09_03_split_domain_to_positions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | 5 | from tqdm import tqdm 6 | 7 | 8 | random.seed(42) 9 | 10 | NUM_SHARDS = 200 11 | 12 | PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_S3 = "s3://m4-datasets/webdocs/line_dedup_domain_to_positions.json" 13 | PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_LOCAL = "/scratch/line_dedup_domain_to_positions.json" 14 | 15 | PATH_SAVE_S3_LINE_DEDUP_DOMAIN_TO_POSITIONS_SHARDED = ( 16 | "s3://m4-datasets/webdocs/line_dedup_domain_to_positions_sharded/" 17 | ) 18 | 19 | 20 | if __name__ == "__main__": 21 | command_sync_s3 = f"aws s3 cp {PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_S3} {PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_LOCAL}" 22 | os.system(command_sync_s3) 23 | 24 | with open(PATH_LINE_DEDUP_DOMAIN_TO_POSITIONS_LOCAL) as f: 25 | domain_to_positions = json.load(f) 26 | 27 | keys = list(domain_to_positions.keys()) 28 | random.shuffle(keys) 29 | 30 | sublist_size = len(keys) // NUM_SHARDS + 1 31 | keys_per_shard = [set(keys[i : i + sublist_size]) for i in range(0, len(keys), sublist_size)] 32 | 33 | domain_to_positions_shard = [] 34 | 35 | for idx_shard in tqdm(range(NUM_SHARDS)): 36 | domain_to_positions_shard.append( 37 | {k: v for k, v in domain_to_positions.items() if k in keys_per_shard[idx_shard]} 38 | ) 39 | 40 | with open(f"/scratch/line_dedup_domain_to_positions_{idx_shard}.json", "w") as f: 41 | json.dump(domain_to_positions_shard[idx_shard], f) 42 | 43 | for idx_shard in tqdm(range(NUM_SHARDS)): 44 | path_disk = f"/scratch/line_dedup_domain_to_positions_{idx_shard}.json" 45 | path_s3 = os.path.join( 46 | PATH_SAVE_S3_LINE_DEDUP_DOMAIN_TO_POSITIONS_SHARDED, str(idx_shard), "line_dedup_domain_to_positions.json" 47 | ) 48 | command_sync_s3 = f"aws s3 
cp {path_disk} {path_s3}" 49 | os.system(command_sync_s3) 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/clean_m4_prelimenary_experiments/README.md: -------------------------------------------------------------------------------- 1 | This folder traces the exploration of additional cleaning that could be brought to the CM4 dataset. 2 | 3 | As a result of this exploration phase, two potential improvements were identified: 4 | 1. Remove HTML nodes (and their descendants) whose tag class attribute value contains either "footer" or "site-info". From the exploration, these nodes correspond to the "web" (boilerplate) parts of the page. 5 | 2. Split the HTML at the "continue reading" occurrence, which is often identified by a tag whose class attribute value contains "more-link". 6 | 7 | **Before fully implementing it**, we tested the suitability of improvement 2 by creating a filtered version of CM4 that excluded all documents containing a "continue reading" occurrence (`04_get_banned_url.slurm` and `05_filter_cm4.slurm`). 8 | 9 | The `explore` folder contains Streamlit spaces that were used to find new possible cleaning rules. 10 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/create_evaluation_datasets/MMBench/make_mmbench.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from copy import deepcopy 3 | from io import BytesIO 4 | 5 | import datasets 6 | import pandas as pd 7 | from datasets import Dataset 8 | from PIL import Image 9 | 10 | 11 | PATH_MMBENCH_DATA = ( # DL from https://opencompass.org.cn/mmbench 12 | "/Users/hugolaurencon/Desktop/mmbench_dev_20230712.tsv" 13 | ) 14 | NUM_PROC = 10 15 | REPO_ID = "HuggingFaceM4/MMBench_dev" 16 | 17 | 18 | data_frame = pd.read_csv(PATH_MMBENCH_DATA, sep="\t", header=0) 19 | 20 | 21 | ds = Dataset.from_pandas(data_frame) 22 | ds = ds.remove_columns(["index", "category", "source", "l2-category", "comment", "split"]) 23 | ds = ds.rename_column("answer", "label") 24 | 25 | 26 | def map_func_transform_image_column(example): 27 | example["image"] = Image.open(BytesIO(base64.b64decode(example["image"]))) 28 | return example 29 | 30 | 31 | new_features = deepcopy(ds.features) 32 | new_features["image"] = datasets.Image() 33 | new_features["label"] = datasets.features.ClassLabel(names=["A", "B", "C", "D"]) 34 | 35 | ds = ds.map(map_func_transform_image_column, features=new_features, num_proc=NUM_PROC) 36 | 37 | ds.push_to_hub(REPO_ID) 38 | 39 | 40 | def map_func_modif_context(example): 41 | question = example["question"] 42 | hint = example["hint"] 43 | context = [] 44 | if hint: 45 | context.append(f"Context: {hint}") 46 | context.append(f"Question: {question}") 47 | context.append("Possible answers:") 48 | for key in ["A", "B", "C", "D"]: 49 | ans = example[key] 50 | if ans: 51 | context.append(f"{key}: {ans}") 52 | context.append("Correct answer: ") 53 | example["context"] = "\n".join(context) 54 | return example 55 | 56 | 57 | ds = ds.map(map_func_modif_context, num_proc=NUM_PROC) 58 | ds = ds.remove_columns(["question", "hint", "A", "B", "C", "D"]) 59 | ds.push_to_hub(REPO_ID + "_modif", private=True) 60 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/create_evaluation_datasets/create_AI2D/create_ai2d.py:
-------------------------------------------------------------------------------- 1 | """ 2 | srun --pty --cpus-per-task=8 --partition=hopper-cpu --qos high bash -i 3 | conda activate shared-m4 4 | """ 5 | 6 | 7 | import datasets 8 | from datasets import DatasetDict, load_dataset 9 | 10 | 11 | ORIGINAL_NAME_DS = "lmms-lab/ai2d" 12 | ORIGINAL_SPLIT_DS = "test" 13 | 14 | NUM_PROC = 32 15 | 16 | POSSIBLE_LABELS = ["1", "2", "3", "4"] 17 | 18 | FEATURES = datasets.Features( 19 | { 20 | "question": datasets.Value("string"), 21 | "label": datasets.features.ClassLabel(names=POSSIBLE_LABELS), 22 | "image": datasets.Image(decode=True), 23 | } 24 | ) 25 | 26 | NAME_DS_PUSH_HUB = "HuggingFaceM4/AI2D" 27 | 28 | 29 | def map_func_transform_ai2d_ds(example): 30 | example["label"] = str(int(example["answer"]) + 1) 31 | question = example["question"].strip() 32 | question = f"Question: {question}\nChoices:\n" 33 | choices = example["options"] 34 | for idx_choice, choice in enumerate(choices): 35 | question += f"Choice {idx_choice + 1}: {choice}\n" 36 | # question += "Answer with the option number." # Commented because should be defined in the evaluation prompt 37 | example["question"] = question.strip() 38 | return example 39 | 40 | 41 | ds_test = load_dataset(ORIGINAL_NAME_DS, split=ORIGINAL_SPLIT_DS) 42 | columns_to_remove = [c_n for c_n in ds_test.column_names if c_n not in list(FEATURES.keys())] 43 | ds_test = ds_test.map( 44 | map_func_transform_ai2d_ds, remove_columns=columns_to_remove, features=FEATURES, num_proc=NUM_PROC 45 | ) 46 | print(ds_test[0]["question"]) 47 | 48 | 49 | ds_all_splits = DatasetDict({"test": ds_test}) 50 | ds_all_splits.push_to_hub(NAME_DS_PUSH_HUB, private=True) 51 | 52 | # Cache dataset 53 | test_loading = load_dataset(NAME_DS_PUSH_HUB) 54 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/create_evaluation_datasets/create_imagenet1k_1ksupportset_subsets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datasets import load_dataset 3 | from tqdm import tqdm 4 | 5 | 6 | NUM_SUBSETS = 6 7 | 8 | NAME_DS = "HuggingFaceM4/imagenet1k_support_1k_query_sets" 9 | 10 | 11 | ds_subsets = [load_dataset(NAME_DS, use_auth_token=True) for _ in range(NUM_SUBSETS)] 12 | 13 | num_test_examples = ds_subsets[0]["test_query_set"].num_rows 14 | 15 | selected_indices = np.array_split(range(num_test_examples), NUM_SUBSETS) 16 | 17 | for idx_ds in range(NUM_SUBSETS): 18 | ds_subsets[idx_ds]["test_query_set"] = ds_subsets[idx_ds]["test_query_set"].select(selected_indices[idx_ds]) 19 | 20 | for idx_ds in tqdm(range(NUM_SUBSETS)): 21 | ds_subsets[idx_ds].push_to_hub(repo_id=NAME_DS + f"_part_{idx_ds}", private=True) 22 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/create_evaluation_datasets/create_imagenet1k_5ksupportset_subsets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datasets import load_dataset 3 | from tqdm import tqdm 4 | 5 | 6 | NUM_SUBSETS = 6 7 | 8 | NAME_DS = "HuggingFaceM4/imagenet1k_support_5k_query_sets" 9 | 10 | 11 | ds_subsets = [load_dataset(NAME_DS, use_auth_token=True) for _ in range(NUM_SUBSETS)] 12 | 13 | num_test_examples = ds_subsets[0]["test_query_set"].num_rows 14 | 15 | selected_indices = np.array_split(range(num_test_examples), NUM_SUBSETS) 16 | 17 | for idx_ds in range(NUM_SUBSETS): 18 | 
ds_subsets[idx_ds]["test_query_set"] = ds_subsets[idx_ds]["test_query_set"].select(selected_indices[idx_ds]) 19 | 20 | for idx_ds in tqdm(range(NUM_SUBSETS)): 21 | ds_subsets[idx_ds].push_to_hub(repo_id=NAME_DS + f"_part_{idx_ds}", private=True) 22 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/create_evaluation_datasets/create_vqav2_subsets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datasets import load_dataset 3 | from tqdm import tqdm 4 | 5 | 6 | NUM_SUBSETS = 6 7 | 8 | NAME_DS = "HuggingFaceM4/VQAv2_modif_support_query_sets" 9 | 10 | 11 | ds_subsets = [load_dataset(NAME_DS, use_auth_token=True) for _ in range(NUM_SUBSETS)] 12 | 13 | num_test_examples = ds_subsets[0]["test_query_set"].num_rows 14 | 15 | selected_indices = np.array_split(range(num_test_examples), NUM_SUBSETS) 16 | 17 | for idx_ds in range(NUM_SUBSETS): 18 | ds_subsets[idx_ds]["test_query_set"] = ds_subsets[idx_ds]["test_query_set"].select(selected_indices[idx_ds]) 19 | 20 | for idx_ds in tqdm(range(NUM_SUBSETS)): 21 | ds_subsets[idx_ds].push_to_hub(repo_id=NAME_DS + f"_part_{idx_ds}", private=True) 22 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/create_fine_tuning_datasets/create_llavar.py: -------------------------------------------------------------------------------- 1 | # DL finetuning images: https://drive.google.com/file/d/1Ms7OCjcFQ18Whmujszpc9bTp0Jy0Dye4/view?usp=sharing 2 | # DL finetuning instructions: https://drive.google.com/file/d/1ISdKOV1wwVkLHf5FNutctpOBa-CmNRFv/view?usp=sharing 3 | 4 | 5 | import json 6 | import os 7 | 8 | from datasets import Dataset 9 | from PIL import Image 10 | 11 | 12 | PATH_CONV = "/Users/hugolaurencon/Desktop/llava_instruct_150k_llavar_16k.json" 13 | PATH_DIR_IMAGES = "/Users/hugolaurencon/Desktop/finetune" 14 | 15 | 16 | with open(PATH_CONV) as f: 17 | data_conv = json.load(f) 18 | data_conv = data_conv[-15500:] # Earlier entries are only the regular LLaVA instructions 19 | 20 | 21 | all_image = [] 22 | all_user_texts = [] 23 | all_bot_texts = [] 24 | 25 | for conv in data_conv: 26 | image_path = os.path.join(PATH_DIR_IMAGES, conv["image"]) 27 | image = Image.open(image_path) 28 | all_image.append(image) 29 | user_texts = [] 30 | bot_texts = [] 31 | for turn in conv["conversations"]: 32 | if turn["from"] == "human": 33 | user_texts.append(turn["value"].replace("<image>", "").strip()) # drop the "<image>" placeholder from human turns 34 | elif turn["from"] == "gpt": 35 | bot_texts.append(turn["value"]) 36 | assert len(user_texts) == len(bot_texts) 37 | all_user_texts.append(user_texts) 38 | all_bot_texts.append(bot_texts) 39 | 40 | assert len(all_image) == len(all_user_texts) == len(all_bot_texts) 41 | 42 | 43 | ds = Dataset.from_dict({"image": all_image, "user_texts": all_user_texts, "bot_texts": all_bot_texts}) 44 | ds.push_to_hub("HuggingFaceM4/LLaVAR-Instruct-16K") 45 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/create_valid_ds/create_cm4_valid.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets import load_from_disk 4 | 5 | 6 | SUBSET_DIR_PATH = "/scratch/m4/webdocs/web_document_dataset_filtered/" 7 | BIG_SHARD_ID = 0 8 | cm4_valid_path = f"{SUBSET_DIR_PATH}/{BIG_SHARD_ID}" 9 | sync_cmd = ( 10 | "s5cmd sync" 11 | f" 
s3://m4-datasets/webdocs/web_document_dataset_filtered_imgurldedup_nsfwfiltered_urldedup_linededup_finalcleaning_setimgurlsdedup/{BIG_SHARD_ID}/*" 12 | f" {cm4_valid_path}" 13 | ) 14 | 15 | os.system(sync_cmd) 16 | 17 | ds = load_from_disk(cm4_valid_path) 18 | 19 | ds_sample = ds.select(range(10000)) 20 | repo_id = "HuggingFaceM4/cm4_valid-Sample" 21 | ds_sample.push_to_hub(repo_id, "valid", private=True) 22 | 23 | ds.push_to_hub("HuggingFaceM4/cm4_valid", "valid", private=True) 24 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/create_valid_ds/create_coco_valid.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets import load_from_disk 4 | 5 | 6 | SUBSET_DIR_PATH = "/scratch/general_pmd/image/coco/" 7 | valid_path = f"{SUBSET_DIR_PATH}/validation" 8 | sync_cmd = f"s5cmd sync s3://m4-datasets/general_pmd/image/coco/validation/00000-00001/* {valid_path}" 9 | 10 | os.system(sync_cmd) 11 | 12 | ds = load_from_disk(valid_path) 13 | print(ds) 14 | repo_id = "HuggingFaceM4/coco_valid" 15 | ds.push_to_hub(repo_id, "valid", private=True) 16 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/create_valid_ds/create_wiki_valid.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets import concatenate_datasets, load_from_disk 4 | 5 | 6 | SUBSET_DIR_PATH = "/scratch/enwiki/" 7 | valid_path = f"{SUBSET_DIR_PATH}/validation" 8 | sync_cmd = f"s5cmd sync s3://m4-datasets/enwiki/enwiki-v2/valid/* {valid_path}" 9 | 10 | os.system(sync_cmd) 11 | 12 | 13 | shard_valid_path = f"{SUBSET_DIR_PATH}/validation/shard_0" 14 | ds = load_from_disk(shard_valid_path) 15 | print(ds) 16 | repo_id = "HuggingFaceM4/enwiki-v2_valid-Sample" 17 | ds.push_to_hub(repo_id, "valid", private=True) 18 | 19 | 20 | valid_path = [f"{SUBSET_DIR_PATH}/validation/shard_{shard_id}" for shard_id in range(10)] 21 | ds = [load_from_disk(path) for path in valid_path] 22 | ds = concatenate_datasets(ds) 23 | 24 | print(ds) 25 | repo_id = "HuggingFaceM4/enwiki-v2_valid" 26 | ds.push_to_hub(repo_id, "valid", private=True) 27 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/enwiki/REAME.md: -------------------------------------------------------------------------------- 1 | This folder contains all the slurm, bash and python scripts used to build enwiki-v1 and enwiki-v2. The numbering of the files indicates the order in which they were run. 2 | 3 | Beware: these scripts were sometimes run on several machines in parallel, each processing a portion of the shards; the changes needed to parallelize the work are not contained in the scripts in the `slurm_and_bash_scripts` folder.
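For reference, the parallelization was typically achieved by hard-coding a machine index into the entry script, in the spirit of `03_parallel_dl_images_create_dataset.py` from the webdocs pipeline. A minimal sketch of the idea (the shard and machine counts below are placeholders, not the values actually used for enwiki):

```python
import sys

import numpy as np

idx_machine = int(sys.argv[1])  # index of the machine running this copy of the script
NUM_SHARDS = 68  # total number of shards to process (placeholder)
NUM_MACHINES = 4  # number of machines working in parallel (placeholder)

# Split the shard ids evenly across machines; each machine only processes its own slice.
shard_ids = [el.tolist() for el in np.array_split(range(NUM_SHARDS), NUM_MACHINES)][idx_machine]
```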
4 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/enwiki/python_scripts/05_download_remaining_urls.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from m4.sourcing.data_collection.processors.web_document_extractor import download_images 4 | 5 | 6 | SHARD_ID = 9 7 | NUM_SHARDS = 33 8 | DATA_DIR = Path("/home/lucile/local_datasets/enwiki/enwiki-NS0-20230220-ENTERPRISE-HTML-EXTRACTION") 9 | DATASET_NAME_INCOMPLETE_EXAMPLES = "wikipedia_html_enterprise-with-images-incomplete-v1-v2" 10 | NUM_PROC = 32 // 2 11 | REMAINING_URLS_FILENAME = f"remaining_urls_v2_shard_{SHARD_ID}.txt" 12 | DOWNLOADED_IMAGES_DIRNAME = f"downloaded_images-v3_shard_{SHARD_ID}" 13 | 14 | 15 | path_save_file_image_urls = DATA_DIR / REMAINING_URLS_FILENAME 16 | path_save_dir_downloaded_images = DATA_DIR / DOWNLOADED_IMAGES_DIRNAME 17 | number_sample_per_shard = 10_000 18 | image_size = 256 19 | resize_mode = "no" 20 | num_proc = 1 21 | thread_count = 1 22 | 23 | download_images( 24 | path_save_file_image_urls, 25 | path_save_dir_downloaded_images, 26 | number_sample_per_shard, 27 | image_size, 28 | resize_mode, 29 | num_proc, 30 | thread_count, 31 | ) 32 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/enwiki/python_scripts/06_create_image_dataset.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | from pathlib import Path 4 | 5 | from m4.sourcing.data_collection.processors.web_document_extractor import create_dataset_images_from_tar 6 | 7 | 8 | path_save_dir_downloaded_images = Path("/home/lucile/local_datasets/enwiki/enwiki-v2-downloaded-images") 9 | path_save_dir_tmp_datasets_images = Path("/home/lucile/local_datasets/enwiki/enwiki-v2-ds-images-tmp") 10 | num_proc = 16 11 | path_save_file_map_url_idx = Path("/home/lucile/local_datasets/enwiki/enwiki-v2-map-url-idx.json") 12 | path_save_dir_dataset_images = Path("/home/lucile/local_datasets/enwiki/enwiki-v2-ds-images") 13 | 14 | tar_paths = [] 15 | for path_save_dir_downloaded_images_shard in path_save_dir_downloaded_images.glob("*"): 16 | if path_save_dir_downloaded_images_shard.is_dir(): 17 | tar_paths.extend(glob.glob(os.path.join(path_save_dir_downloaded_images_shard, "*.tar"))) 18 | 19 | create_dataset_images_from_tar( 20 | tar_paths, 21 | path_save_dir_tmp_datasets_images, 22 | num_proc, 23 | path_save_file_map_url_idx, 24 | path_save_dir_dataset_images, 25 | ) 26 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/enwiki/python_scripts/08_save_dataset.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from pathlib import Path 3 | 4 | from datasets import concatenate_datasets, load_from_disk 5 | 6 | from m4.sourcing.data_collection.processors.web_document_extractor import save_split_sharded_already_splitted_dataset 7 | 8 | 9 | NUM_SHARDS = 68 10 | DS_V1_PATH = Path("/home/lucile/local_datasets/enwiki/enwiki-v1") 11 | DS_V2_COMMON_PATH = Path("/home/lucile/local_datasets/enwiki/enwiki-NS0-20230220-ENTERPRISE-HTML-EXTRACTION") 12 | EXCLUDE_SHARD_IDS = [34] 13 | DATASET_NAME_COMPLETE_EXAMPLES_V2 = "wikipedia_html_enterprise-with-images-full-v2-v3" 14 | SHARD_SIZE = 20_000 15 | 16 | DS_FINAL_DS_PATH = Path("/home/lucile/local_datasets/enwiki/enwiki-v2") 17 | ds_v1 = 
load_from_disk(DS_V1_PATH) 18 | # %% 19 | ds_v1 20 | # %% 21 | ds_v1_merged = concatenate_datasets([ds_v1["train"], ds_v1["valid"]]) 22 | # %% 23 | ds_v1_merged 24 | # %% 25 | 26 | ds_list = [] 27 | for shard_id in range(0, NUM_SHARDS): 28 | if shard_id in EXCLUDE_SHARD_IDS: 29 | continue 30 | print(f"Processing shard {shard_id}...") 31 | shard_dir = DS_V2_COMMON_PATH / f"shard_{shard_id}" 32 | ds_path = shard_dir / DATASET_NAME_COMPLETE_EXAMPLES_V2 33 | ds = load_from_disk(ds_path) 34 | ds_list.append(ds) 35 | 36 | ds_v2 = concatenate_datasets(ds_list) 37 | # %% 38 | ds_full = concatenate_datasets([ds_v1_merged, ds_v2]) 39 | # %% 40 | ds_full = ds_full.remove_columns(["images_urls", "num_found", "num_not_found", "mismatches"]) 41 | # %% 42 | ds_full = ds_full.train_test_split(test_size=0.05, shuffle=False) 43 | ds_full["valid"] = ds_full["test"] 44 | ds_full.pop("test") 45 | 46 | save_split_sharded_already_splitted_dataset( 47 | ds_full, Path("/home/lucile/local_datasets/enwiki") / "enwiki-v2-full", SHARD_SIZE 48 | ) 49 | # %% 50 | -------------------------------------------------------------------------------- /vision/data/datasets_processing_scripts/integrate_evaluation_benchmarks_chatbot/gqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import datasets 5 | from datasets import DatasetDict, load_dataset 6 | 7 | from datasets_processing_scripts.build_concatenation_datasets_sft.build_ds_sft import ( 8 | PROMPTS_ANSWER_SHORTLY, 9 | convert_img_to_bytes, 10 | ) 11 | 12 | 13 | NUM_PROC = 96 14 | 15 | FEATURES = datasets.Features( 16 | { 17 | "image": datasets.Image(decode=True), 18 | "question": datasets.Value("string"), 19 | "answers": datasets.Sequence(datasets.Value("string")), 20 | "question_id": datasets.Value("int64"), 21 | } 22 | ) 23 | 24 | 25 | def map_transform_gqa(example): 26 | path_image = os.path.join("/fsx/hugo/gqa/images", os.path.basename(example["image_id"])) 27 | image_bytes = convert_img_to_bytes(img_path=path_image, format="JPEG") 28 | question = example["question"] + random.choice(PROMPTS_ANSWER_SHORTLY) 29 | example["image"] = {"path": None, "bytes": image_bytes} 30 | example["question"] = question 31 | example["answers"] = [example["label"]] 32 | return example 33 | 34 | 35 | def load_gqa(split): 36 | ds_gqa = load_dataset("Graphcore/gqa", split=split) 37 | columns_to_keep = ["image", "question", "answers", "question_id"] 38 | columns_to_remove = [c_n for c_n in ds_gqa.column_names if c_n not in columns_to_keep] 39 | ds_gqa = ds_gqa.map(map_transform_gqa, remove_columns=columns_to_remove, features=FEATURES, num_proc=NUM_PROC) 40 | return ds_gqa 41 | 42 | 43 | ds_gqa_all_splits = DatasetDict( 44 | {"train": load_gqa("train"), "validation": load_gqa("validation"), "test": load_gqa("test")} 45 | ) 46 | 47 | ds_gqa_all_splits.push_to_hub("HuggingFaceM4/GQA", private=True) 48 | -------------------------------------------------------------------------------- /vision/evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | We implemented the evaluations for SmolVLM in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit). 4 | They can be run by following the instructions in their repository. 5 | 6 | We also have our own internal evaluation scripts; they can be found in the `experiments/evaluation` folder. The code supporting them lives in the `m4` folder.
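As a sketch, a typical VLMEvalKit run looks like the following (the exact model and benchmark identifiers are assumptions; check the names registered in VLMEvalKit's configs before running):

```bash
git clone https://github.com/open-compass/VLMEvalKit.git
cd VLMEvalKit && pip install -e .
# --model and --data must be identifiers registered inside VLMEvalKit
python run.py --model SmolVLM --data MMMU_DEV_VAL MathVista_MINI --verbose
```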
-------------------------------------------------------------------------------- /vision/experiments/evaluation/vloom/common/accelerate_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: {} 3 | distributed_type: MULTI_GPU 4 | fsdp_config: {} 5 | machine_rank: 0 6 | main_process_ip: null 7 | main_process_port: null 8 | main_training_function: main 9 | mixed_precision: 'no' 10 | num_machines: null 11 | num_processes: null 12 | use_cpu: false 13 | -------------------------------------------------------------------------------- /vision/experiments/evaluation/vloom/common/sync_evaluations_on_gcs.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=gcs_sync_eval 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --qos=qos_cpu-dev 6 | #SBATCH --partition=compil 7 | #SBATCH --cpus-per-task=1 # number of cores per tasks 8 | #SBATCH --hint=nomultithread # we get physical cores not logical 9 | #SBATCH --time 00:05:00 # maximum execution time (HH:MM:SS) 10 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/local_experiment_dir/evals/run_eval_master/logs/%x_%j.out 11 | #SBATCH --account=cnw@cpu 12 | #SBATCH --mail-type=FAIL,INVALID_DEPEND,REQUEUE,STAGE_OUT,TIME_LIMIT 13 | #SBATCH --mail-user=hf-m4-jz@googlegroups.com 14 | #SBATCH --no-requeue 15 | 16 | set -x -e 17 | 18 | source $cnw_ALL_CCFRWORK/start-m4-user 19 | 20 | gsutil cp $EVALUATION_JSONL_FILE gs://hf-science-m4-cold/local_experiment_dir/evals/results/ 21 | -------------------------------------------------------------------------------- /vision/experiments/evaluation/vloom/common/sync_evaluations_on_s3.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=s3_sync_eval 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=1 # number of cores per tasks 6 | #SBATCH --time 00:05:00 # maximum execution time (HH:MM:SS) 7 | #SBATCH --mem-per-cpu=11G 8 | #SBATCH --output=/fsx/m4/evals/run_eval_master/logs/%x_%j.out 9 | set -x -e 10 | 11 | source $cnw_ALL_CCFRWORK/start-m4-user 12 | BASENAME_EVALUATION_JSONL_FILE="$(basename $EVALUATION_JSONL_FILE)" 13 | aws s3 cp $EVALUATION_JSONL_FILE s3://m4-exps/eval_results/$BASENAME_EVALUATION_JSONL_FILE 14 | -------------------------------------------------------------------------------- /vision/experiments/evaluation/vloom/common/sync_evaluations_on_wandb.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=run_eval_automatic 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --qos=qos_cpu-dev 6 | #SBATCH --partition=compil 7 | #SBATCH --cpus-per-task=4 # number of cores per tasks 8 | #SBATCH --hint=nomultithread # we get physical cores not logical 9 | #SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) 10 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/local_experiment_dir/evals/run_eval_master/logs/%x_%j.out 11 | #SBATCH --account=cnw@cpu 12 | #SBATCH --mail-type=FAIL,INVALID_DEPEND,REQUEUE,STAGE_OUT,TIME_LIMIT 13 | #SBATCH --mail-user=hf-m4-jz@googlegroups.com 14 | #SBATCH --no-requeue 15 | 16 | set -x -e 17 | 18 | source $cnw_ALL_CCFRWORK/start-m4-user 19 | 20 | conda activate $CONDA_ENV_NAME 21 | 22 | pushd $WORKING_DIR 23 | 24 | python m4/evaluation/scripts/sync_evaluations_on_wandb.py \ 25 | 
--evaluation_jsonl_files $EVALUATION_JSONL_FILE \ 26 | --run_name_to_log $RUN_NAME 27 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/common/sync_and_upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # $1: save_dir 4 | # $2: exp_name 5 | # $3: previously saved step 6 | # $4: current saved step 7 | # Example use: ./sync_and_upload.sh /home/victor/experiments vllama_debug opt_step-40 opt_step-50 8 | 9 | if [[ -n "$4" ]]; then 10 | if [[ -n "$3" ]]; then 11 | s5cmd sync "$1/$2/$3/" "s3://m4-exps/$2/$3/" && rm -rf "$1/$2/$3" 12 | fi 13 | s5cmd cp "$1/$2/$4/" "s3://m4-exps/$2/$4/" 14 | fi 15 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/common/webdataset_get_file.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create a unique temp file per tar 4 | tmpfile=$(mktemp /scratch/m4data/tmp-dldata.XXXXXXXX) 5 | 6 | # auto-remove the temp file when the script exits for any reason or receives SIGINT/SIGTERM 7 | trap "rm -f $tmpfile" EXIT INT TERM 8 | 9 | # make sure that the only output from the script is generated by the `cat` command below 10 | s5cmd cp $1 $tmpfile > /dev/null 2>&1 11 | cat $tmpfile 12 | 13 | # note: tmpfile gets autodeleted on exit via trap above 14 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/accelerate_config_multi_node.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | deepspeed_config_file: ./experiments/pretraining/vloom/slurm_scripts_templates/ds_config.json 5 | zero3_init_flag: true 6 | distributed_type: DEEPSPEED 7 | fsdp_config: {} 8 | machine_rank: 0 9 | main_process_ip: null 10 | main_process_port: null 11 | main_training_function: main 12 | mixed_precision: fp16 13 | num_machines: 2 14 | num_processes: 8 15 | use_cpu: false 16 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/accelerate_config_single_node.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | deepspeed_config_file: ./experiments/pretraining/vloom/slurm_scripts_templates/ds_config.json 5 | zero3_init_flag: true 6 | distributed_type: DEEPSPEED 7 | fsdp_config: {} 8 | machine_rank: 0 9 | main_process_ip: null 10 | main_process_port: null 11 | main_training_function: main 12 | mixed_precision: fp16 13 | num_machines: 1 14 | num_processes: 4 15 | use_cpu: false 16 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "auto_cast": true, 5 | "loss_scale": 0, 6 | "initial_scale_power": 32, 7 | "loss_scale_window": 1000, 8 | "hysteresis": 2, 9 | "min_loss_scale": 1 10 | }, 11 | "zero_optimization": { 12 | "stage": 2, 13 | "allgather_partitions": true, 14 | "allgather_bucket_size": 5e8, 15 | "overlap_comm": false, 16 | "reduce_scatter": true, 17 
| "reduce_bucket_size": "auto", 18 | "contiguous_gradients": true, 19 | "offload_optimizer": { 20 | "device": "cpu" 21 | }, 22 | "offload_param": { 23 | "device": "cpu" 24 | }, 25 | "stage3_gather_16bit_weights_on_model_save": "auto" 26 | }, 27 | "train_micro_batch_size_per_gpu": "auto", 28 | "train_batch_size": "auto", 29 | "gradient_clipping": "auto" 30 | } 31 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/ds_config_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 2, 7 | "allgather_partitions": true, 8 | "allgather_bucket_size": 5e8, 9 | "overlap_comm": false, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": "auto", 12 | "contiguous_gradients": true 13 | }, 14 | "gradient_clipping": "auto", 15 | "train_batch_size": "auto", 16 | "train_micro_batch_size_per_gpu": "auto" 17 | } 18 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/hfc_with_launcher/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=hopper-prod 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/$RUN_NAME/logs/%x-%j.out 8 | 9 | set -e 10 | 11 | ### EDIT ME START ### 12 | 13 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 14 | RUN_FREQUENCY_IN_HOURS=3 15 | 16 | CONDA_ENV_NAME=shared-m4 17 | 18 | M4_REPO_PATH=/fsx/m4/repos/m4 19 | EXPERIMENT_NAME=tr_184_xxx 20 | 21 | ### EDIT ME END ### 22 | 23 | 24 | echo "START TIME: $(date)" 25 | 26 | source /fsx/m4/start-m4-user 27 | conda activate base 28 | conda activate $CONDA_ENV_NAME 29 | 30 | # ensure to restart self first 31 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 32 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 33 | 34 | echo "running checkpoint cleanup" 35 | 36 | 37 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 38 | 39 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH 40 | 41 | echo "END TIME: $(date)" 42 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/hfc_with_launcher/convert-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=convert-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --gres=gpu:8 7 | #SBATCH --partition=hopper-prod 8 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/$RUN_NAME/logs/%x-%j.out 9 | 10 | 11 | set -e 12 | 13 | ### EDIT ME START ### 14 | 15 | # how often to try to run the checkpoint conversion - hint: approximately as often as a checkpoint is saved 16 | RUN_FREQUENCY_IN_HOURS=3 17 | 18 | CONDA_ENV_NAME=shared-m4 19 | 20 | M4_REPO_PATH=/fsx/m4/repos/m4 21 | EXPERIMENT_NAME=tr_184_xxx 22 | 23 | ### EDIT ME END ### 24 | 25 | 26 | echo "START TIME: $(date)" 27 | 28 | source /fsx/m4/start-m4-user 29 | conda activate base 30 | conda activate $CONDA_ENV_NAME 31 | 32 | # ensure to restart self first 33 | echo 
scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 34 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour convert-checkpoints.slurm 35 | 36 | echo "running checkpoint converter" 37 | 38 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 39 | 40 | python -u $M4_REPO_PATH/m4/scripts/convert-checkpoints.py $M4_CHECKPOINTS_PATH 41 | 42 | echo "END TIME: $(date)" 43 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/hfc_with_launcher/s3-upload-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=s3-upload-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=hopper-prod 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/$RUN_NAME/logs/%x-%j.out 8 | 9 | 10 | set -e 11 | 12 | ### EDIT ME START ### 13 | 14 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved 15 | RUN_FREQUENCY_IN_HOURS=3 16 | 17 | CONDA_ENV_NAME=shared-m4 18 | 19 | M4_REPO_PATH=/fsx/m4/repos/m4 20 | EXPERIMENT_NAME=tr_184_xxx 21 | 22 | ### EDIT ME END ### 23 | 24 | 25 | echo "START TIME: $(date)" 26 | 27 | source /fsx/m4/start-m4-user 28 | conda activate base 29 | conda activate $CONDA_ENV_NAME 30 | 31 | # ensure to restart self first 32 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 33 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm 34 | 35 | echo "running checkpoint uploader" 36 | 37 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 38 | 39 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH 40 | 41 | echo "END TIME: $(date)" 42 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/hfc_with_launcher/schedule-evals.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=schedule-evals 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=00:30:00 6 | #SBATCH --partition=hopper-prod 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/$RUN_NAME/logs/%x-%j.out 8 | 9 | set -e 10 | 11 | ### EDIT ME START ### 12 | 13 | # how often to try to run the eval scheduling - hint: approximately as often as a checkpoint is saved 14 | RUN_FREQUENCY_IN_HOURS=3 15 | 16 | CONDA_ENV_NAME=shared-m4 17 | 18 | M4_REPO_PATH=/fsx/m4/repos/m4 19 | EXPERIMENT_NAME=tr_184_xxx 20 | 21 | ### EDIT ME END ### 22 | 23 | 24 | echo "START TIME: $(date)" 25 | 26 | source /fsx/m4/start-m4-user 27 | conda activate base 28 | conda activate $CONDA_ENV_NAME 29 | 30 | # ensure to restart self first 31 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 32 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour schedule-evals.slurm 33 | 34 | echo "running eval scheduler" 35 | 36 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 37 | 38 | python -u $M4_REPO_PATH/m4/scripts/schedule-evals.py $M4_CHECKPOINTS_PATH 39 | 40 | echo "END TIME: $(date)" 41 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/with_launcher/accelerate_config.yaml: -------------------------------------------------------------------------------- 1 |
compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | deepspeed_config_file: ./experiments/pretraining/vloom/xxxx/ds_config.json 5 | gradient_accumulation_steps: 1 6 | gradient_clipping: 1.0 7 | offload_optimizer_device: cpu 8 | offload_param_device: cpu 9 | zero3_init_flag: true 10 | zero_stage: 2 11 | distributed_type: DEEPSPEED 12 | fsdp_config: {} 13 | machine_rank: 0 14 | main_process_ip: null 15 | main_process_port: null 16 | main_training_function: main 17 | mixed_precision: bf16 18 | num_machines: null 19 | num_processes: null 20 | use_cpu: false 21 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/with_launcher/config.yaml: -------------------------------------------------------------------------------- 1 | data_param: 2 | map_batch_size: 32 3 | num_workers: 2 4 | p_next: 0. 5 | max_seq_len: 77 6 | pad_dataset: True 7 | realtime_processing: True 8 | persistent_workers: False 9 | hparams: 10 | tokenizer_name: gpt2 11 | tokenizer_params: '{"additional_special_tokens":[AddedToken("", rstrip=False, lstrip=False)], "use_fast":True}' 12 | tokenizer_add_special_tokens: '{"pad_token": tokenizer.eos_token}' 13 | model_name: gpt2-xl 14 | model_params: 15 | vision_image_size: 224 16 | vision_model_name: openai/clip-vit-base-patch16 17 | vision_model_params: '{"id2label":{}, "label2id":{}}' 18 | tie_word_embeddings: True 19 | freeze_lm_head: True 20 | freeze_text_layers: True 21 | freeze_vision_layers: True 22 | alpha_initializer: zeros 23 | alpha_type: float 24 | cross_layer_interval: 1 25 | batch_size: 8 26 | grad_acc_size: 1 27 | grad_clip: 1.0 28 | max_num_opt_steps: 500_000 29 | seed: 13 30 | train_logging_opt_steps: 10 31 | train_saving_opt_steps: 250 32 | val_logging_opt_steps: 250 33 | wandb_enable: true 34 | wandb_entity: huggingfacem4 35 | wandb_log_freq: 10 36 | wandb_project: VLOOM 37 | optim_param: 38 | vl_optim: AdamW 39 | vl_optim_params: 40 | betas: [0.9, 0.999] 41 | lr: 0.0001 42 | weight_decay: 0.1 43 | no_decay: ["bias", "alpha", "layernorm", "ln"] 44 | vl_lr_scheduler: get_constant_schedule_with_warmup 45 | vl_lr_scheduler_params: 46 | last_epoch: -1 47 | num_warmup_steps: 5_000 48 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/slurm_scripts_templates/with_launcher/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 2, 7 | "allgather_partitions": true, 8 | "allgather_bucket_size": 5e8, 9 | "overlap_comm": false, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": "auto", 12 | "contiguous_gradients": true 13 | }, 14 | "gradient_clipping": "auto", 15 | "train_batch_size": "auto", 16 | "train_micro_batch_size_per_gpu": "auto" 17 | } 18 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_341_smolvlm_025b_1st_stage/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_341_vsmollm2-cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | set -e 11 | 12 | # 
----------------- Auto-Workdir ----------------- 13 | if [ -n $SLURM_JOB_ID ]; then 14 | # check the original location through scontrol and $SLURM_JOB_ID 15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 16 | else 17 | # otherwise: started with bash. Get the real location. 18 | SCRIPT_PATH=$(realpath $0) 19 | fi 20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 22 | # -------------------------------------------------- 23 | 24 | ### EDIT ME START ### 25 | 26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 27 | RUN_FREQUENCY_IN_HOURS=6 28 | 29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 30 | 31 | EXPERIMENT_NAME=tr_341_vsmollm2_05b 32 | 33 | ### EDIT ME END ### 34 | 35 | 36 | echo "START TIME: $(date)" 37 | 38 | source /fsx/m4/start-m4-user 39 | conda activate base 40 | conda activate $CONDA_ENV_NAME 41 | 42 | pushd $M4_REPO_PATH 43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 45 | 46 | # ensure to restart self first 47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 49 | 50 | echo "running checkpoint cleanup" 51 | 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_341_smolvlm_025b_1st_stage/merge_lora_and_resize_eou.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=template-merge-lora-and-resize-eou 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --time=3:00:00 8 | #SBATCH --partition=hopper-prod 9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out 10 | #SBATCH --qos=high 11 | 12 | set -e 13 | 14 | # ----------------- Auto-Workdir ----------------- 15 | if [ -n $SLURM_JOB_ID ]; then 16 | # check the original location through scontrol and $SLURM_JOB_ID 17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 18 | else 19 | # otherwise: started with bash. Get the real location. 
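# NOTE: sbatch executes a spooled copy of the submitted script, so $0 cannot be
# trusted under Slurm; the Command= field from `scontrol show job` (branch above)
# recovers the originally submitted path. Beware that the unquoted test
# `[ -n $SLURM_JOB_ID ]` collapses to `[ -n ]` when the variable is unset, and a
# one-argument test is always true, so this bash fallback is effectively
# unreachable when the script is started with plain bash; a quoted sketch such as
# `if [ -n "${SLURM_JOB_ID:-}" ]; then` would take the intended branch instead.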
20 | SCRIPT_PATH=$(realpath $0) 21 | fi 22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd) 24 | 25 | # -------------------------------------------------- 26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3" 27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-25750" 28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_25750_merge_and_resize_eou" 29 | 30 | source /fsx/m4/start-m4-user 31 | conda activate base 32 | conda activate $CONDA_ENV_NAME 33 | 34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR 35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR 36 | echo "Done" 37 | 38 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_341_smolvlm_025b_1st_stage/s3-upload-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_341-s3-upload-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | 11 | set -e 12 | 13 | # ----------------- Auto-Workdir ----------------- 14 | if [ -n $SLURM_JOB_ID ]; then 15 | # check the original location through scontrol and $SLURM_JOB_ID 16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 17 | else 18 | # otherwise: started with bash. Get the real location. 19 | SCRIPT_PATH=$(realpath $0) 20 | fi 21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 23 | # -------------------------------------------------- 24 | 25 | ### EDIT ME START ### 26 | 27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved 28 | RUN_FREQUENCY_IN_HOURS=8 29 | 30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 31 | 32 | EXPERIMENT_NAME=tr_341_vsmollm2_05b 33 | 34 | ### EDIT ME END ### 35 | 36 | 37 | echo "START TIME: $(date)" 38 | 39 | source /fsx/m4/start-m4-user 40 | conda activate base 41 | conda activate $CONDA_ENV_NAME 42 | 43 | pushd $M4_REPO_PATH 44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 46 | 47 | # ensure to restart self first 48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm 50 | 51 | echo "running s3 checkpoint upload" 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_341_smolvlm_025b_1st_stage/schedule-evals.sh: -------------------------------------------------------------------------------- 1 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_1024.slurm 2 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_2048.slurm 3 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_1024.slurm 4 | sbatch 
experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_2048.slurm 5 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_343_smolvlm_05b_1st_stage/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_343-cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_343_vsmollm2_05b/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | set -e 11 | 12 | # ----------------- Auto-Workdir ----------------- 13 | if [ -n $SLURM_JOB_ID ]; then 14 | # check the original location through scontrol and $SLURM_JOB_ID 15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 16 | else 17 | # otherwise: started with bash. Get the real location. 18 | SCRIPT_PATH=$(realpath $0) 19 | fi 20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 22 | # -------------------------------------------------- 23 | 24 | ### EDIT ME START ### 25 | 26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 27 | RUN_FREQUENCY_IN_HOURS=6 28 | 29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 30 | 31 | EXPERIMENT_NAME=tr_343_vsmollm2_05b 32 | 33 | ### EDIT ME END ### 34 | 35 | 36 | echo "START TIME: $(date)" 37 | 38 | source /fsx/m4/start-m4-user 39 | conda activate base 40 | conda activate $CONDA_ENV_NAME 41 | 42 | pushd $M4_REPO_PATH 43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 45 | 46 | # ensure to restart self first 47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 49 | 50 | echo "running checkpoint cleanup" 51 | 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_343_smolvlm_05b_1st_stage/merge_lora_and_resize_eou.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=template-merge-lora-and-resize-eou 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --time=3:00:00 8 | #SBATCH --partition=hopper-prod 9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out 10 | #SBATCH --qos=high 11 | 12 | set -e 13 | 14 | # ----------------- Auto-Workdir ----------------- 15 | if [ -n $SLURM_JOB_ID ]; then 16 | # check the original location through scontrol and $SLURM_JOB_ID 17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 18 | else 19 | # otherwise: started with bash. Get the real location. 
20 | SCRIPT_PATH=$(realpath $0) 21 | fi 22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd) 24 | 25 | # -------------------------------------------------- 26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3" 27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_343_vsmollm2_05b/opt_step-24750" 28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_343_vsmollm2_05b_opt_step_24750_merge_and_resize_eou" 29 | 30 | source /fsx/m4/start-m4-user 31 | conda activate base 32 | conda activate $CONDA_ENV_NAME 33 | 34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR 35 | python experiments/pretraining/vloom/tr_343_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR 36 | echo "Done" 37 | 38 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_343_smolvlm_05b_1st_stage/s3-upload-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_343-s3-upload-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_343_vsmollm2_05b/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | 11 | set -e 12 | 13 | # ----------------- Auto-Workdir ----------------- 14 | if [ -n $SLURM_JOB_ID ]; then 15 | # check the original location through scontrol and $SLURM_JOB_ID 16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 17 | else 18 | # otherwise: started with bash. Get the real location. 19 | SCRIPT_PATH=$(realpath $0) 20 | fi 21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 23 | # -------------------------------------------------- 24 | 25 | ### EDIT ME START ### 26 | 27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved 28 | RUN_FREQUENCY_IN_HOURS=8 29 | 30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 31 | 32 | EXPERIMENT_NAME=tr_343_vsmollm2_05b 33 | 34 | ### EDIT ME END ### 35 | 36 | 37 | echo "START TIME: $(date)" 38 | 39 | source /fsx/m4/start-m4-user 40 | conda activate base 41 | conda activate $CONDA_ENV_NAME 42 | 43 | pushd $M4_REPO_PATH 44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 46 | 47 | # ensure to restart self first 48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm 50 | 51 | echo "running s3 checkpoint upload" 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_343_smolvlm_05b_1st_stage/schedule-evals.slurm: -------------------------------------------------------------------------------- 1 | sbatch experiments/evaluation/vloom/async_evals_tr_343/run_evals_4_shots_captioning_1024.slurm 2 | sbatch experiments/evaluation/vloom/async_evals_tr_343/run_evals_4_shots_captioning_2048.slurm 3 | sbatch experiments/evaluation/vloom/async_evals_tr_343/run_evals_4_shots_vqa_1024.slurm 4 | sbatch 
experiments/evaluation/vloom/async_evals_tr_343/run_evals_4_shots_vqa_2048.slurm 5 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_345_vsmollm2_256M_2nd_stage/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_320-cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_320_vsmollm2_long_context/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | set -e 11 | 12 | # ----------------- Auto-Workdir ----------------- 13 | if [ -n $SLURM_JOB_ID ]; then 14 | # check the original location through scontrol and $SLURM_JOB_ID 15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 16 | else 17 | # otherwise: started with bash. Get the real location. 18 | SCRIPT_PATH=$(realpath $0) 19 | fi 20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 22 | # -------------------------------------------------- 23 | 24 | ### EDIT ME START ### 25 | 26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 27 | RUN_FREQUENCY_IN_HOURS=6 28 | 29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 30 | 31 | EXPERIMENT_NAME=tr_320_vsmollm2_long_context 32 | 33 | ### EDIT ME END ### 34 | 35 | 36 | echo "START TIME: $(date)" 37 | 38 | source /fsx/m4/start-m4-user 39 | conda activate base 40 | conda activate $CONDA_ENV_NAME 41 | 42 | pushd $M4_REPO_PATH 43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 45 | 46 | # ensure to restart self first 47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 49 | 50 | echo "running checkpoint cleanup" 51 | 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_345_vsmollm2_256M_2nd_stage/merge_lora_and_resize_eou.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=template-merge-lora-and-resize-eou 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --time=3:00:00 8 | #SBATCH --partition=hopper-prod 9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out 10 | #SBATCH --qos=high 11 | 12 | set -e 13 | 14 | # ----------------- Auto-Workdir ----------------- 15 | if [ -n $SLURM_JOB_ID ]; then 16 | # check the original location through scontrol and $SLURM_JOB_ID 17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 18 | else 19 | # otherwise: started with bash. Get the real location. 
20 | SCRIPT_PATH=$(realpath $0) 21 | fi 22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd) 24 | 25 | # -------------------------------------------------- 26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3" 27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000" 28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou" 29 | 30 | source /fsx/m4/start-m4-user 31 | conda activate base 32 | conda activate $CONDA_ENV_NAME 33 | 34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR 35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR 36 | echo "Done" 37 | 38 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_345_vsmollm2_256M_2nd_stage/s3-upload-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_345-s3-upload-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_345_vsmollm2_256M_2nd_stage/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | 11 | set -e 12 | 13 | # ----------------- Auto-Workdir ----------------- 14 | if [ -n $SLURM_JOB_ID ]; then 15 | # check the original location through scontrol and $SLURM_JOB_ID 16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 17 | else 18 | # otherwise: started with bash. Get the real location. 19 | SCRIPT_PATH=$(realpath $0) 20 | fi 21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 23 | # -------------------------------------------------- 24 | 25 | ### EDIT ME START ### 26 | 27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved 28 | RUN_FREQUENCY_IN_HOURS=8 29 | 30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 31 | 32 | EXPERIMENT_NAME=tr_345_vsmollm2_256M_2nd_stage 33 | 34 | ### EDIT ME END ### 35 | 36 | 37 | echo "START TIME: $(date)" 38 | 39 | source /fsx/m4/start-m4-user 40 | conda activate base 41 | conda activate $CONDA_ENV_NAME 42 | 43 | pushd $M4_REPO_PATH 44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 46 | 47 | # ensure to restart self first 48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm 50 | 51 | echo "running s3 checkpoint upload" 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_345_vsmollm2_256M_2nd_stage/schedule-evals.sh: -------------------------------------------------------------------------------- 1 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_1024.slurm 2 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_2048.slurm 3 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_1024.slurm 4 | sbatch 
experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_2048.slurm 5 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_346_vsmollm2_256M_3rd_stage/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_346-cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_346_vsmollm2_256M_3rd_stage/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | set -e 11 | 12 | # ----------------- Auto-Workdir ----------------- 13 | if [ -n $SLURM_JOB_ID ]; then 14 | # check the original location through scontrol and $SLURM_JOB_ID 15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 16 | else 17 | # otherwise: started with bash. Get the real location. 18 | SCRIPT_PATH=$(realpath $0) 19 | fi 20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 22 | # -------------------------------------------------- 23 | 24 | ### EDIT ME START ### 25 | 26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 27 | RUN_FREQUENCY_IN_HOURS=6 28 | 29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 30 | 31 | EXPERIMENT_NAME=tr_346_vsmollm2_256M_3rd_stage 32 | 33 | ### EDIT ME END ### 34 | 35 | 36 | echo "START TIME: $(date)" 37 | 38 | source /fsx/m4/start-m4-user 39 | conda activate base 40 | conda activate $CONDA_ENV_NAME 41 | 42 | pushd $M4_REPO_PATH 43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 45 | 46 | # ensure to restart self first 47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 49 | 50 | echo "running checkpoint cleanup" 51 | 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_346_vsmollm2_256M_3rd_stage/merge_lora_and_resize_eou.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=template-merge-lora-and-resize-eou 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --time=3:00:00 8 | #SBATCH --partition=hopper-prod 9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out 10 | #SBATCH --qos=high 11 | 12 | set -e 13 | 14 | # ----------------- Auto-Workdir ----------------- 15 | if [ -n $SLURM_JOB_ID ]; then 16 | # check the original location through scontrol and $SLURM_JOB_ID 17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 18 | else 19 | # otherwise: started with bash. Get the real location. 
20 | SCRIPT_PATH=$(realpath $0) 21 | fi 22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd) 24 | 25 | # -------------------------------------------------- 26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3" 27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000" 28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou" 29 | 30 | source /fsx/m4/start-m4-user 31 | conda activate base 32 | conda activate $CONDA_ENV_NAME 33 | 34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR 35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR 36 | echo "Done" 37 | 38 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_346_vsmollm2_256M_3rd_stage/s3-upload-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_346-s3-upload-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=hopper-prod 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_346_vsmollm2_256M_3rd_stage/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | 11 | set -e 12 | 13 | # ----------------- Auto-Workdir ----------------- 14 | if [ -n $SLURM_JOB_ID ]; then 15 | # check the original location through scontrol and $SLURM_JOB_ID 16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 17 | else 18 | # otherwise: started with bash. Get the real location. 19 | SCRIPT_PATH=$(realpath $0) 20 | fi 21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 23 | # -------------------------------------------------- 24 | 25 | ### EDIT ME START ### 26 | 27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved 28 | RUN_FREQUENCY_IN_HOURS=8 29 | 30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 31 | 32 | EXPERIMENT_NAME=tr_346_vsmollm2_256M_3rd_stage 33 | 34 | ### EDIT ME END ### 35 | 36 | 37 | echo "START TIME: $(date)" 38 | 39 | source /fsx/m4/start-m4-user 40 | conda activate base 41 | conda activate $CONDA_ENV_NAME 42 | 43 | pushd $M4_REPO_PATH 44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 46 | 47 | # ensure to restart self first 48 | # echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 49 | # sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm 50 | 51 | echo "running s3 checkpoint upload" 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_346_vsmollm2_256M_3rd_stage/schedule-evals.sh: -------------------------------------------------------------------------------- 1 | sbatch experiments/evaluation/vloom/async_evals_tr_346/run_evals_0_shots_val_512.slurm 2 | sbatch experiments/evaluation/vloom/async_evals_tr_346/run_evals_0_shots_val_1024.slurm 3 | sbatch experiments/evaluation/vloom/async_evals_tr_346/run_evals_0_shots_val_1536.slurm 4 | sbatch 
experiments/evaluation/vloom/async_evals_tr_346/run_evals_0_shots_val_2048.slurm 5 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_347_smolvlm_500M_2nd_stage/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_320-cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_320_vsmollm2_long_context/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | set -e 11 | 12 | # ----------------- Auto-Workdir ----------------- 13 | if [ -n $SLURM_JOB_ID ]; then 14 | # check the original location through scontrol and $SLURM_JOB_ID 15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 16 | else 17 | # otherwise: started with bash. Get the real location. 18 | SCRIPT_PATH=$(realpath $0) 19 | fi 20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 22 | # -------------------------------------------------- 23 | 24 | ### EDIT ME START ### 25 | 26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 27 | RUN_FREQUENCY_IN_HOURS=6 28 | 29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 30 | 31 | EXPERIMENT_NAME=tr_320_vsmollm2_long_context 32 | 33 | ### EDIT ME END ### 34 | 35 | 36 | echo "START TIME: $(date)" 37 | 38 | source /fsx/m4/start-m4-user 39 | conda activate base 40 | conda activate $CONDA_ENV_NAME 41 | 42 | pushd $M4_REPO_PATH 43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 45 | 46 | # ensure to restart self first 47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 49 | 50 | echo "running checkpoint cleanup" 51 | 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_347_smolvlm_500M_2nd_stage/merge_lora_and_resize_eou.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=template-merge-lora-and-resize-eou 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --time=3:00:00 8 | #SBATCH --partition=hopper-prod 9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out 10 | #SBATCH --qos=high 11 | 12 | set -e 13 | 14 | # ----------------- Auto-Workdir ----------------- 15 | if [ -n $SLURM_JOB_ID ]; then 16 | # check the original location through scontrol and $SLURM_JOB_ID 17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 18 | else 19 | # otherwise: started with bash. Get the real location. 
20 | SCRIPT_PATH=$(realpath $0) 21 | fi 22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd) 24 | 25 | # -------------------------------------------------- 26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3" 27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000" 28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou" 29 | 30 | source /fsx/m4/start-m4-user 31 | conda activate base 32 | conda activate $CONDA_ENV_NAME 33 | 34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR 35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR 36 | echo "Done" 37 | 38 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_347_smolvlm_500M_2nd_stage/s3-upload-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_345-s3-upload-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_345_vsmollm2_256M_2nd_stage/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | 11 | set -e 12 | 13 | # ----------------- Auto-Workdir ----------------- 14 | if [ -n $SLURM_JOB_ID ]; then 15 | # check the original location through scontrol and $SLURM_JOB_ID 16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 17 | else 18 | # otherwise: started with bash. Get the real location. 19 | SCRIPT_PATH=$(realpath $0) 20 | fi 21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 23 | # -------------------------------------------------- 24 | 25 | ### EDIT ME START ### 26 | 27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved 28 | RUN_FREQUENCY_IN_HOURS=8 29 | 30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 31 | 32 | EXPERIMENT_NAME=tr_345_vsmollm2_256M_2nd_stage 33 | 34 | ### EDIT ME END ### 35 | 36 | 37 | echo "START TIME: $(date)" 38 | 39 | source /fsx/m4/start-m4-user 40 | conda activate base 41 | conda activate $CONDA_ENV_NAME 42 | 43 | pushd $M4_REPO_PATH 44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 46 | 47 | # ensure to restart self first 48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm 50 | 51 | echo "running s3 checkpoint upload" 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_347_smolvlm_500M_2nd_stage/schedule-evals.sh: -------------------------------------------------------------------------------- 1 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_1024.slurm 2 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_2048.slurm 3 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_1024.slurm 4 | sbatch 
experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_2048.slurm 5 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_348_smolvlm_2B/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_348-cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_348_smolvlm_2B_token_fix/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | set -e 11 | 12 | # ----------------- Auto-Workdir ----------------- 13 | if [ -n $SLURM_JOB_ID ]; then 14 | # check the original location through scontrol and $SLURM_JOB_ID 15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 16 | else 17 | # otherwise: started with bash. Get the real location. 18 | SCRIPT_PATH=$(realpath $0) 19 | fi 20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 22 | # -------------------------------------------------- 23 | 24 | ### EDIT ME START ### 25 | 26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 27 | RUN_FREQUENCY_IN_HOURS=6 28 | 29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 30 | 31 | EXPERIMENT_NAME=tr_348_smolvlm_2B_token_fix 32 | 33 | ### EDIT ME END ### 34 | 35 | 36 | echo "START TIME: $(date)" 37 | 38 | source /fsx/m4/start-m4-user 39 | conda activate base 40 | conda activate $CONDA_ENV_NAME 41 | 42 | pushd $M4_REPO_PATH 43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 45 | 46 | # ensure to restart self first 47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 49 | 50 | echo "running checkpoint cleanup" 51 | 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_348_smolvlm_2B/merge_lora_and_resize_eou.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=template-merge-lora-and-resize-eou 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --time=3:00:00 8 | #SBATCH --partition=hopper-prod 9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out 10 | #SBATCH --qos=high 11 | 12 | set -e 13 | 14 | # ----------------- Auto-Workdir ----------------- 15 | if [ -n $SLURM_JOB_ID ]; then 16 | # check the original location through scontrol and $SLURM_JOB_ID 17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 18 | else 19 | # otherwise: started with bash. Get the real location. 
20 | SCRIPT_PATH=$(realpath $0) 21 | fi 22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd) 24 | 25 | # -------------------------------------------------- 26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3" 27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_348_smolvlm_2B_token_fix/opt_step-22250/" 28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_348_smolvlm_2B_token_fix_opt_step_22250_merge_and_resize_eou" 29 | 30 | source /fsx/m4/start-m4-user 31 | conda activate base 32 | conda activate $CONDA_ENV_NAME 33 | 34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR 35 | python experiments/pretraining/vloom/tr_348_smolvlm_2B_token_fix/resize_embed_for_eou.py $OUTPUT_DIR 36 | echo "Done" 37 | 38 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_348_smolvlm_2B/s3-upload-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_348-s3-upload-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_348_smolvlm_2B_token_fix/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | 11 | set -e 12 | 13 | # ----------------- Auto-Workdir ----------------- 14 | if [ -n $SLURM_JOB_ID ]; then 15 | # check the original location through scontrol and $SLURM_JOB_ID 16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 17 | else 18 | # otherwise: started with bash. Get the real location. 19 | SCRIPT_PATH=$(realpath $0) 20 | fi 21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 23 | # -------------------------------------------------- 24 | 25 | ### EDIT ME START ### 26 | 27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved 28 | RUN_FREQUENCY_IN_HOURS=8 29 | 30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 31 | 32 | EXPERIMENT_NAME=tr_348_smolvlm_2B_token_fix 33 | 34 | ### EDIT ME END ### 35 | 36 | 37 | echo "START TIME: $(date)" 38 | 39 | source /fsx/m4/start-m4-user 40 | conda activate base 41 | conda activate $CONDA_ENV_NAME 42 | 43 | pushd $M4_REPO_PATH 44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 46 | 47 | # ensure to restart self first 48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm 50 | 51 | echo "running s3 checkpoint upload" 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_348_smolvlm_2B/schedule-evals.slurm: -------------------------------------------------------------------------------- 1 | sbatch experiments/evaluation/vloom/async_evals_tr_348/run_evals_4_shots_captioning_1024.slurm 2 | sbatch experiments/evaluation/vloom/async_evals_tr_348/run_evals_4_shots_captioning_1920.slurm 3 | sbatch experiments/evaluation/vloom/async_evals_tr_348/run_evals_4_shots_vqa_1024.slurm 4 | 
sbatch experiments/evaluation/vloom/async_evals_tr_348/run_evals_4_shots_vqa_1920.slurm 5 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_349_vsmollm2_500M_3rd_stage/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_349-cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_349_vsmollm2_500M_3rd_stage/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | set -e 11 | 12 | # ----------------- Auto-Workdir ----------------- 13 | if [ -n $SLURM_JOB_ID ]; then 14 | # check the original location through scontrol and $SLURM_JOB_ID 15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 16 | else 17 | # otherwise: started with bash. Get the real location. 18 | SCRIPT_PATH=$(realpath $0) 19 | fi 20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 22 | # -------------------------------------------------- 23 | 24 | ### EDIT ME START ### 25 | 26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 27 | RUN_FREQUENCY_IN_HOURS=6 28 | 29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 30 | 31 | EXPERIMENT_NAME=tr_349_vsmollm2_500M_3rd_stage 32 | 33 | ### EDIT ME END ### 34 | 35 | 36 | echo "START TIME: $(date)" 37 | 38 | source /fsx/m4/start-m4-user 39 | conda activate base 40 | conda activate $CONDA_ENV_NAME 41 | 42 | pushd $M4_REPO_PATH 43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 45 | 46 | # ensure to restart self first 47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 49 | 50 | echo "running checkpoint cleanup" 51 | 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_349_vsmollm2_500M_3rd_stage/merge_lora_and_resize_eou.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=template-merge-lora-and-resize-eou 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --time=3:00:00 8 | #SBATCH --partition=hopper-prod 9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out 10 | #SBATCH --qos=high 11 | 12 | set -e 13 | 14 | # ----------------- Auto-Workdir ----------------- 15 | if [ -n $SLURM_JOB_ID ]; then 16 | # check the original location through scontrol and $SLURM_JOB_ID 17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 18 | else 19 | # otherwise: started with bash. Get the real location. 
20 | SCRIPT_PATH=$(realpath $0) 21 | fi 22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd) 24 | 25 | # -------------------------------------------------- 26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3" 27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000" 28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou" 29 | 30 | source /fsx/m4/start-m4-user 31 | conda activate base 32 | conda activate $CONDA_ENV_NAME 33 | 34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR 35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR 36 | echo "Done" 37 | 38 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_349_vsmollm2_500M_3rd_stage/s3-upload-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_349-s3-upload-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=hopper-prod 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_349_vsmollm2_500M_3rd_stage/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | 11 | set -e 12 | 13 | # ----------------- Auto-Workdir ----------------- 14 | if [ -n $SLURM_JOB_ID ]; then 15 | # check the original location through scontrol and $SLURM_JOB_ID 16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 17 | else 18 | # otherwise: started with bash. Get the real location. 19 | SCRIPT_PATH=$(realpath $0) 20 | fi 21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 23 | # -------------------------------------------------- 24 | 25 | ### EDIT ME START ### 26 | 27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved 28 | RUN_FREQUENCY_IN_HOURS=8 29 | 30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 31 | 32 | EXPERIMENT_NAME=tr_349_vsmollm2_500M_3rd_stage 33 | 34 | ### EDIT ME END ### 35 | 36 | 37 | echo "START TIME: $(date)" 38 | 39 | source /fsx/m4/start-m4-user 40 | conda activate base 41 | conda activate $CONDA_ENV_NAME 42 | 43 | pushd $M4_REPO_PATH 44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 46 | 47 | # ensure to restart self first 48 | # echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 49 | # sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm 50 | 51 | echo "running s3 checkpoint upload" 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_349_vsmollm2_500M_3rd_stage/schedule-evals.sh: -------------------------------------------------------------------------------- 1 | sbatch experiments/evaluation/vloom/async_evals_tr_349/run_evals_0_shots_val_512.slurm 2 | sbatch experiments/evaluation/vloom/async_evals_tr_349/run_evals_0_shots_val_1024.slurm 3 | sbatch experiments/evaluation/vloom/async_evals_tr_349/run_evals_0_shots_val_1536.slurm 4 | sbatch 
experiments/evaluation/vloom/async_evals_tr_349/run_evals_0_shots_val_2048.slurm 5 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_350_smolvlm_2B_2nd_stage/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_320-cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_320_vsmollm2_long_context/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | set -e 11 | 12 | # ----------------- Auto-Workdir ----------------- 13 | if [ -n $SLURM_JOB_ID ]; then 14 | # check the original location through scontrol and $SLURM_JOB_ID 15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 16 | else 17 | # otherwise: started with bash. Get the real location. 18 | SCRIPT_PATH=$(realpath $0) 19 | fi 20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 22 | # -------------------------------------------------- 23 | 24 | ### EDIT ME START ### 25 | 26 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 27 | RUN_FREQUENCY_IN_HOURS=6 28 | 29 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 30 | 31 | EXPERIMENT_NAME=tr_320_vsmollm2_long_context 32 | 33 | ### EDIT ME END ### 34 | 35 | 36 | echo "START TIME: $(date)" 37 | 38 | source /fsx/m4/start-m4-user 39 | conda activate base 40 | conda activate $CONDA_ENV_NAME 41 | 42 | pushd $M4_REPO_PATH 43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 45 | 46 | # ensure to restart self first 47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 49 | 50 | echo "running checkpoint cleanup" 51 | 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH --skip-evals-check 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_350_smolvlm_2B_2nd_stage/merge_lora_and_resize_eou.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=template-merge-lora-and-resize-eou 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --time=3:00:00 8 | #SBATCH --partition=hopper-prod 9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out 10 | #SBATCH --qos=high 11 | 12 | set -e 13 | 14 | # ----------------- Auto-Workdir ----------------- 15 | if [ -n $SLURM_JOB_ID ]; then 16 | # check the original location through scontrol and $SLURM_JOB_ID 17 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 18 | else 19 | # otherwise: started with bash. Get the real location. 
20 | SCRIPT_PATH=$(realpath $0) 21 | fi 22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd) 24 | 25 | # -------------------------------------------------- 26 | CONDA_ENV_NAME="shared-m4-2024-05-28-copy3" 27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_341_vsmollm2_05b/opt_step-18000" 28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_341_vsmollm2_05b_opt_step_18000_merge_and_resize_eou" 29 | 30 | source /fsx/m4/start-m4-user 31 | conda activate base 32 | conda activate $CONDA_ENV_NAME 33 | 34 | python m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR 35 | python experiments/pretraining/vloom/tr_341_vsmollm2_05b/resize_embed_for_eou.py $OUTPUT_DIR 36 | echo "Done" 37 | 38 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_350_smolvlm_2B_2nd_stage/s3-upload-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_350-s3-upload-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=hopper-cpu 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_350_smolvlm_2B_2nd_stage/logs/crons/%x-%j.out 8 | #SBATCH --qos high 9 | 10 | 11 | set -e 12 | 13 | # ----------------- Auto-Workdir ----------------- 14 | if [ -n $SLURM_JOB_ID ]; then 15 | # check the original location through scontrol and $SLURM_JOB_ID 16 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 17 | else 18 | # otherwise: started with bash. Get the real location. 19 | SCRIPT_PATH=$(realpath $0) 20 | fi 21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 23 | # -------------------------------------------------- 24 | 25 | ### EDIT ME START ### 26 | 27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved 28 | RUN_FREQUENCY_IN_HOURS=8 29 | 30 | CONDA_ENV_NAME=shared-m4-2024-05-28-copy3 31 | 32 | EXPERIMENT_NAME=tr_350_smolvlm_2B_2nd_stage 33 | 34 | ### EDIT ME END ### 35 | 36 | 37 | echo "START TIME: $(date)" 38 | 39 | source /fsx/m4/start-m4-user 40 | conda activate base 41 | conda activate $CONDA_ENV_NAME 42 | 43 | pushd $M4_REPO_PATH 44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 46 | 47 | # ensure to restart self first 48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm 50 | 51 | echo "running s3 checkpoint upload" 52 | 53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 54 | 55 | python -u $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH 56 | 57 | echo "END TIME: $(date)" 58 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_350_smolvlm_2B_2nd_stage/schedule-evals.sh: -------------------------------------------------------------------------------- 1 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_1024.slurm 2 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_captioning_2048.slurm 3 | sbatch experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_1024.slurm 4 | sbatch 
experiments/evaluation/vloom/async_evals_tr_341/run_evals_4_shots_vqa_2048.slurm 5 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_cron_template/README.md: -------------------------------------------------------------------------------- 1 | # Chronicles 2 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_cron_template/cleanup-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_XYZ-cleanup-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=1:00:00 6 | #SBATCH --partition=production-cluster 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_cron_template/logs/crons/%x-%j.out 8 | 9 | set -e 10 | 11 | # ----------------- Auto-Workdir ----------------- 12 | if [ -n $SLURM_JOB_ID ]; then 13 | # check the original location through scontrol and $SLURM_JOB_ID 14 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 15 | else 16 | # otherwise: started with bash. Get the real location. 17 | SCRIPT_PATH=$(realpath $0) 18 | fi 19 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) 20 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd) 21 | # -------------------------------------------------- 22 | 23 | ### EDIT ME START ### 24 | 25 | # how often to try to run the cleanup - hint: approximately as often as a checkpoint is saved 26 | RUN_FREQUENCY_IN_HOURS=1 27 | 28 | CONDA_ENV_NAME=shared-m4 29 | 30 | EXPERIMENT_NAME=tr_cron_template 31 | 32 | ### EDIT ME END ### 33 | 34 | 35 | echo "START TIME: $(date)" 36 | 37 | source /fsx/m4/start-m4-user 38 | conda activate base 39 | conda activate $CONDA_ENV_NAME 40 | 41 | pushd $M4_REPO_PATH 42 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH 43 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME 44 | 45 | # ensure to restart self first 46 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours 47 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour cleanup-checkpoints.slurm 48 | 49 | echo "running checkpoint cleanup" 50 | 51 | 52 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME} 53 | 54 | $M4_REPO_PATH/m4/scripts/cleanup-checkpoints.py $M4_CHECKPOINTS_PATH 55 | 56 | echo "END TIME: $(date)" 57 | -------------------------------------------------------------------------------- /vision/experiments/pretraining/vloom/tr_cron_template/convert-checkpoints.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tr_XYZ-convert-checkpoints 3 | #SBATCH --ntasks=1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=3:00:00 6 | #SBATCH --partition=production-cluster 7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_cron_template/logs/crons/%x-%j.out 8 | 9 | 10 | set -e 11 | 12 | # ----------------- Auto-Workdir ----------------- 13 | if [ -n $SLURM_JOB_ID ]; then 14 | # check the original location through scontrol and $SLURM_JOB_ID 15 | SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}') 16 | else 17 | # otherwise: started with bash. Get the real location. 
18 |     SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 | 
24 | ### EDIT ME START ###
25 | 
26 | # how often to try to run the checkpoint conversion - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=1
28 | 
29 | CONDA_ENV_NAME=shared-m4
30 | 
31 | EXPERIMENT_NAME=tr_cron_template
32 | 
33 | ### EDIT ME END ###
34 | 
35 | 
36 | echo "START TIME: $(date)"
37 | 
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 | 
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 | 
46 | # make sure to reschedule this job first, so the cron chain survives any failure below
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour convert-checkpoints.slurm
49 | 
50 | echo "running checkpoint converter"
51 | 
52 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
53 | 
54 | $M4_REPO_PATH/m4/scripts/convert-checkpoints.py $M4_CHECKPOINTS_PATH
55 | 
56 | echo "END TIME: $(date)"
57 | 
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_cron_template/s3-upload-checkpoints.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_XYZ-s3-upload-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=production-cluster
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_cron_template/logs/crons/%x-%j.out
8 | 
9 | 
10 | set -e
11 | 
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 |     # check the original location through scontrol and $SLURM_JOB_ID
15 |     SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 |     # otherwise: started with bash. Get the real location.
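    # (realpath resolves symlinks here, so the relative walk below lands on the
    # real checkout no matter how the script was invoked)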
18 |     SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
22 | # --------------------------------------------------
23 | 
24 | ### EDIT ME START ###
25 | 
26 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
27 | RUN_FREQUENCY_IN_HOURS=1
28 | 
29 | CONDA_ENV_NAME=shared-m4
30 | 
31 | EXPERIMENT_NAME=tr_cron_template
32 | 
33 | ### EDIT ME END ###
34 | 
35 | 
36 | echo "START TIME: $(date)"
37 | 
38 | source /fsx/m4/start-m4-user
39 | conda activate base
40 | conda activate $CONDA_ENV_NAME
41 | 
42 | pushd $M4_REPO_PATH
43 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
44 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
45 | 
46 | # make sure to reschedule this job first, so the cron chain survives any failure below
47 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
48 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-checkpoints.slurm
49 | 
50 | echo "running checkpoint uploader"
51 | 
52 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
53 | 
54 | $M4_REPO_PATH/m4/scripts/s3-upload-checkpoints.py $M4_CHECKPOINTS_PATH
55 | 
56 | echo "END TIME: $(date)"
57 | 
--------------------------------------------------------------------------------
/vision/experiments/pretraining/vloom/tr_cron_template/s3-upload-run-files.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_XYZ-s3-upload-run-files
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --mem-per-cpu=11G
7 | #SBATCH --partition=production-cluster
8 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/tr_cron_template/logs/crons/%x-%j.out
9 | 
10 | 
11 | set -e
12 | 
13 | # ----------------- Auto-Workdir -----------------
14 | if [ -n "$SLURM_JOB_ID" ]; then
15 |     # check the original location through scontrol and $SLURM_JOB_ID
16 |     SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
17 | else
18 |     # otherwise: started with bash. Get the real location.
19 |     SCRIPT_PATH=$(realpath $0)
20 | fi
21 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
22 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../../../; pwd)
23 | # --------------------------------------------------
24 | 
25 | ### EDIT ME START ###
26 | 
27 | # how often to try to run the checkpoint upload - hint: approximately as often as a checkpoint is saved
28 | RUN_FREQUENCY_IN_HOURS=1
29 | 
30 | CONDA_ENV_NAME=shared-m4
31 | 
32 | EXPERIMENT_NAME=tr_cron_template
33 | 
34 | ### EDIT ME END ###
35 | 
36 | 
37 | echo "START TIME: $(date)"
38 | 
39 | source /fsx/m4/start-m4-user
40 | conda activate base
41 | conda activate $CONDA_ENV_NAME
42 | 
43 | pushd $M4_REPO_PATH
44 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
45 | cd $M4_REPO_PATH/experiments/pretraining/vloom/$EXPERIMENT_NAME
46 | 
47 | # make sure to reschedule this job first, so the cron chain survives any failure below
48 | echo scheduling to run again in $RUN_FREQUENCY_IN_HOURS hours
49 | sbatch --begin=now+${RUN_FREQUENCY_IN_HOURS}hour s3-upload-run-files.slurm
50 | 
51 | echo "upload run files"
52 | 
53 | M4_CHECKPOINTS_PATH=/fsx/m4/experiments/local_experiment_dir/${EXPERIMENT_NAME}
54 | 
55 | 
56 | # Upload other files
57 | aws s3 cp $M4_CHECKPOINTS_PATH s3://m4-exps/${EXPERIMENT_NAME} --exclude "*opt*" --exclude "*shared*" --recursive
58 | 
59 | echo "END TIME: $(date)"
60 | 
--------------------------------------------------------------------------------
/vision/finetuning/README.md:
--------------------------------------------------------------------------------
1 | # Finetuning
2 | 
3 | Here you can find a notebook to finetune SmolVLM on Visual Question Answering on a consumer GPU with QLoRA.
--------------------------------------------------------------------------------
/vision/m4/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.utils import logging
2 | 
--------------------------------------------------------------------------------
/vision/m4/evaluation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/evaluation/__init__.py
--------------------------------------------------------------------------------
/vision/m4/evaluation/custom_metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.evaluation.custom_metrics.classification_vqa_metrics import ClassificationVQAMetrics
2 | from m4.evaluation.custom_metrics.doc_vqa_metrics import DocVQAMetrics
3 | from m4.evaluation.custom_metrics.image_caption_matching_metrics import ImageCaptionMatchingMetrics
4 | from m4.evaluation.custom_metrics.open_ended_vqa_metrics import OpenEndedVQAMetrics
5 | from m4.evaluation.custom_metrics.perplexity_metrics import PerplexityMetrics
6 | from m4.evaluation.custom_metrics.unfolded_classification_metrics import UnfoldedClassificationMetrics
7 | from m4.evaluation.custom_metrics.unfolded_image_captioning_metrics import UnfoldedImageCaptioningMetrics
8 | 
--------------------------------------------------------------------------------
/vision/m4/evaluation/evaluators/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.evaluation.evaluators.in_contexter import in_contexter
2 | from m4.evaluation.evaluators.linear_prober import linear_prober
3 | 
--------------------------------------------------------------------------------
/vision/m4/evaluation/generation/README.md:
--------------------------------------------------------------------------------
1 | # Generation Process
2 | 
3 | - find one or more opt-step checkpoints to make generations with
4 | - create a folder in code/m4/experiments/generations
5 | - add a config.yaml and a [gen_folder_name]_generate.slurm file
6 | - fill the config file according to the desired hyperparameters: prompt/num_beams/ngram_repeats, etc.
7 | - run sbatch [m4_repo_name]/experiments/generation/[gen_folder_name]/[gen_folder_name]_generate.slurm
8 | - check wandb and make sure your column shows up. If it doesn't, click on "columns" at the bottom right of the generation table and slide the missing generation to the "Displayed columns" side
9 | 
--------------------------------------------------------------------------------
/vision/m4/evaluation/generation/deprecated_generation/log_generation.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=make_generation
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --partition prepost
6 | #SBATCH --cpus-per-task=1 # number of cores per task
7 | #SBATCH --hint=nomultithread # we get physical cores not logical
8 | #SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/generation_dir/logs/%x_%j.out
10 | #SBATCH --account=cnw@cpu
11 | 
12 | source $cnw_ALL_CCFRWORK/start-m4-user
13 | 
14 | conda activate $CONDA_ENV_NAME
15 | 
16 | export WANDB_DIR=$cnw_ALL_CCFRSCRATCH/experiments
17 | 
18 | pushd $WORKING_DIR
19 | 
20 | GIT_PYTHON_GIT_EXECUTABLE=`which git`
21 | export GIT_PYTHON_GIT_EXECUTABLE
22 | 
23 | python m4/evaluation/generation/log_generation.py --gen_file $GEN_FILE
24 | 
--------------------------------------------------------------------------------
/vision/m4/evaluation/generation/deprecated_generation/make_generation.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=make_generation
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1
5 | #SBATCH --partition gpu_p2
6 | #SBATCH --gres=gpu:1 # number of gpu
7 | #SBATCH --cpus-per-task=2 # number of cores per task
8 | #SBATCH --hint=nomultithread # we get physical cores not logical
9 | #SBATCH --time 00:30:00 # maximum execution time (HH:MM:SS)
10 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/generation_dir/logs/%x_%j.out
11 | #SBATCH --account=cnw@gpu
12 | 
13 | source $cnw_ALL_CCFRWORK/start-m4-user
14 | 
15 | conda activate leo-flash
16 | 
17 | # We are on an offline partition
18 | export TRANSFORMERS_OFFLINE=1
19 | 
20 | pushd $WORKING_DIR
21 | GIT_PYTHON_GIT_EXECUTABLE=`which git`
22 | export GIT_PYTHON_GIT_EXECUTABLE
23 | 
24 | echo "model paths:"
25 | echo $MODEL_PATHS
26 | python m4/evaluation/generation/launch_generation.py --load_config $CONFIG --job_id $SLURM_JOB_ID
27 | 
--------------------------------------------------------------------------------
/vision/m4/evaluation/generation/deprecated_generation/master_generate.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=generate
3 | #SBATCH --nodes=1
4 | #SBATCH --qos=qos_cpu-dev
5 | #SBATCH --ntasks-per-node=1
6 | #SBATCH --cpus-per-task=1 # number of cores per task
7 | #SBATCH --hint=nomultithread # we get physical cores not logical
8 | #SBATCH --time 00:05:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/generation_dir/logs/%x_%j.out
10 | #SBATCH --account=cnw@cpu
11 | 
12 | source $cnw_ALL_CCFRWORK/start-m4-user
13 | 
14 | CONDA_ENV_NAME="leo"
15 | 
16 | conda activate $CONDA_ENV_NAME
17 | 
18 | # We are on an offline partition
19 | WORKING_DIR=$WORK/code/m4_runs_5
20 | pushd $WORKING_DIR
21 | 
22 | GIT_PYTHON_GIT_EXECUTABLE=`which git`
23 | export GIT_PYTHON_GIT_EXECUTABLE
24 | 
25 | CONFIG="m4/evaluation/generation/generate_config.yaml"
26 | JID_JOB=$(sbatch --job-name=make_generation_$SLURM_JOB_ID --export=ALL,CONDA_ENV_NAME=$CONDA_ENV_NAME,WORKING_DIR=$WORKING_DIR,CONFIG=$CONFIG m4/evaluation/generation/make_generation.slurm)
27 | 
28 | JID_JOB=${JID_JOB##* }
29 | 
30 | echo $JID_JOB
31 | 
32 | GEN_FILE="/gpfsscratch/rech/cnw/commun/experiments/generation_dir/generation_tmp_files_dir/gen_${JID_JOB}.json"
33 | 
34 | echo $GEN_FILE
35 | 
36 | sbatch --dependency=afterok:$JID_JOB --job-name=log_generation_$SLURM_JOB_ID --export=ALL,CONDA_ENV_NAME=$CONDA_ENV_NAME,WORKING_DIR=$WORKING_DIR,GEN_FILE=$GEN_FILE m4/evaluation/generation/log_generation.slurm
37 | 
--------------------------------------------------------------------------------
/vision/m4/evaluation/scripts/README.md:
--------------------------------------------------------------------------------
1 | We need to locally save some datasets with `copy_remote_sample_datasets.py` because the caching function does not work for some datasets, see https://github.com/huggingface/datasets/issues/4760 and https://github.com/huggingface/datasets/issues/3547.
2 | 
--------------------------------------------------------------------------------
/vision/m4/evaluation/scripts/copy_remote_sample_datasets.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | from datasets import load_dataset
4 | 
5 | from m4.evaluation.tasks import VGPT2_SAMPLE_TASKS, Predictor
6 | from m4.evaluation.utils import EvaluationVersion
7 | 
8 | 
9 | MIN_DATASET_SIZE = 100
10 | DEFAULT_NUM_EX_PER_CLASS = 3
11 | 
12 | ALREADY_COPIED_DATASETS = set()
13 | 
14 | model_name = "gpt2"  # Not used but necessary to load the task
15 | tokenizer_name = "t5-base"  # Not used but necessary to load the task
16 | image_size = 224  # Not used but necessary to load the task
17 | evaluation_version = EvaluationVersion.v2  # Not used but necessary to load the task
18 | 
19 | save_dir = Path("/gpfsscratch/rech/cnw/commun/local_datasets")
20 | 
21 | 
22 | def load_and_save_dataset(task, split, save_dir):
23 |     dataset_split = load_dataset(task.dataset_name, name=task.dataset_config, split=split, use_auth_token=True)
24 |     print("********************************************************")
25 |     print(task.__class__.__name__)
26 |     print(len(dataset_split))
27 |     print(f"Dataset name is {task.dataset_name} and split is {split} and config is {task.dataset_config}")
28 |     print("********************************************************")
29 | 
30 |     dataset_split.save_to_disk(save_dir / task.dataset_name / split)
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     for task_object in VGPT2_SAMPLE_TASKS[Predictor.in_contexter]:
35 |         task = task_object(
36 |             model_name=model_name,
37 |             tokenizer_name=tokenizer_name,
38 |             image_size=image_size,
39 |             evaluation_version=evaluation_version,
40 |         )
41 | 
42 |         load_and_save_dataset(task, task.test_split_name, save_dir)
43 |         load_and_save_dataset(task, task.train_split_name, save_dir)
44 | 
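The saved copies can then be read back with `datasets.load_from_disk`, which goes straight to the Arrow files on disk and skips the caching machinery that the README above describes as broken. A minimal sketch of the read side — the dataset name and split below are illustrative, but the path layout mirrors the `save_dir / task.dataset_name / split` used by the script:

```python
from pathlib import Path

from datasets import load_from_disk

save_dir = Path("/gpfsscratch/rech/cnw/commun/local_datasets")

# Reads the Arrow files written by `dataset_split.save_to_disk(...)` above,
# without touching the Hub or the datasets cache.
dataset_split = load_from_disk(str(save_dir / "HuggingFaceM4/VQAv2_modif-Sample" / "validation"))
print(dataset_split)
```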
--------------------------------------------------------------------------------
/vision/m4/evaluation/tasks/base.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import List, Optional
3 | 
4 | 
5 | class Predictor(Enum):
6 |     in_contexter = "in_contexter"
7 |     linear_prober = "linear_prober"
8 | 
9 | 
10 | class BaseTask:
11 |     dataset_name: str  # Dataset (example: birdsnap)
12 |     dataset_config: Optional[str] = None  # Dataset config (example: partition_1)
13 |     default_support_split_name: Optional[str] = None
14 |     default_query_split_name: str
15 |     metric_name: str  # the metric to use (example: accuracy) - use evaluate
16 |     metrics_kwargs: Optional[dict] = {}
17 |     extra_metrics: Optional[list] = None
18 |     model_class: str  # The model
19 |     predictor_class: Predictor
20 |     id_column_name: Optional[str] = None
21 | 
22 |     def __init__(self, **kwargs) -> None:
23 |         pass
24 | 
25 | 
26 | class BaseTaskClassification(BaseTask):
27 |     image_column_names: List[str]
28 |     label_column_name: str
29 |     context_column_names: Optional[List[str]] = None
30 |     tested_ex_excluded_context_columns: Optional[List[str]] = None
31 |     tested_labels_column_name: Optional[str] = None
32 |     relevance_scores_column_name: Optional[str] = None
33 | 
34 | 
35 | class BaseTaskOpenEndedVQA(BaseTask):
36 |     image_column_name: str
37 |     question_column_name: str
38 |     answers_column_name: str
39 |     context_column_names: Optional[List[str]] = None
40 | 
41 | 
42 | class BaseTaskImageCaptioning(BaseTask):
43 |     image_column_name: str
44 |     reference_captions_column_name: str
45 |     context_column_name: Optional[str] = None
46 | 
47 | 
48 | class BaseTaskImageCaptionMatching(BaseTask):
49 |     image_column_names: List[str]
50 |     caption_column_names: List[str]
51 | 
--------------------------------------------------------------------------------
/vision/m4/models/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.models.custom_modules import DecoupledEmbedding, DecoupledLinear
2 | from m4.models.idefics.configuration_idefics import IdeficsConfig
3 | from m4.models.idefics.modeling_idefics import IdeficsForCausalLM
4 | from m4.models.vgpt2.configuration_vgpt2 import VGPT2Config
5 | from m4.models.vgpt2.modeling_vgpt2 import VGPT2LMHeadModel
6 | from m4.models.vllama3.configuration_vllama3 import VLlama3Config
7 | from m4.models.vllama3.modeling_vllama3 import VLlama3ForCausalLM
8 | from m4.models.vmistral.configuration_vmistral import VMistralConfig
9 | from m4.models.vmistral.modeling_vmistral import VMistralForCausalLM
10 | 
11 | 
12 | _SUPPORTED_MODELS = {
13 |     "vgpt2": VGPT2Config,
14 |     # "vllama": IdeficsConfig,
15 |     "idefics": IdeficsConfig,
16 |     "vmistral": VMistralConfig,
17 |     "vllama3": VLlama3Config,
18 | }
19 | 
20 | model_type_to_modeling_class = {
21 |     "vgpt2": VGPT2LMHeadModel,
22 |     # "vllama": IdeficsForCausalLM,
23 |     "idefics": IdeficsForCausalLM,
24 |     "vmistral": VMistralForCausalLM,
25 |     "vllama3": VLlama3ForCausalLM,
26 | }
27 | 
--------------------------------------------------------------------------------
/vision/m4/models/idefics/evaluation_classification_vqa_in_context_idefics.py:
--------------------------------------------------------------------------------
1 | from m4.evaluation.custom_metrics.classification_vqa_metrics import ClassifVQAMetrics
2 | from m4.models.vgpt2.evaluation_classification_vqa_in_context_vgpt2 import Vgpt2ClassificationVQAInContext
3 | 
4 | 
5 | class IdeficsClassificationVQAInContext(Vgpt2ClassificationVQAInContext):
6 |     model_class: str = "IdeficsForCausalLM"
7 |     tokenizer_max_seq_len = 2048
8 | 
9 | 
10 | class VQAv2IdeficsClassificationVQAInContextAcc(IdeficsClassificationVQAInContext):
11 |     dataset_name: str = "HuggingFaceM4/VQAv2_modif"
12 |     metric_name: str = "ClassificationVQAMetrics"
13 |     metric_kwargs = {
14 |         "metrics": [
15 |             ClassifVQAMetrics.VQA_ACCURACY,
16 |             ClassifVQAMetrics.ENTROPY_DISTRIBUTION,
17 |             ClassifVQAMetrics.ENTROPY_MEAN,
18 |         ]
19 |     }
20 |     default_query_split_name: str = "validation"
21 |     default_support_split_name: str = "train"
22 |     image_column_name: str = "image"
23 |     question_column_name: str = "question"
24 |     answers_column_name: str = "answers"
25 |     length_normalize: bool = False
26 | 
27 | 
28 | class VQAv2SampleIdeficsClassificationVQAInContextAcc(VQAv2IdeficsClassificationVQAInContextAcc):
29 |     dataset_name: str = "HuggingFaceM4/VQAv2_modif-Sample"
30 | 
--------------------------------------------------------------------------------
/vision/m4/models/idefics/make_tiny_llama.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # This script creates a super tiny model that is useful inside tests, when we just want to test that
4 | # the machinery works, without needing to check the quality of the outcomes.
5 | #
6 | # usage: adjust the configs if wanted, but otherwise just run the script
7 | 
8 | from pathlib import Path
9 | 
10 | from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
11 | 
12 | 
13 | mname_tiny = "tiny-random-LlamaForCausalLM"
14 | 
15 | path = Path(mname_tiny)
16 | path.mkdir(parents=True, exist_ok=True)
17 | 
18 | config = LlamaConfig()
19 | config.update(
20 |     dict(
21 |         vocab_size=32000,
22 |         hidden_size=16,
23 |         intermediate_size=16 * 4,
24 |         num_hidden_layers=2,
25 |         num_attention_heads=4,
26 |     )
27 | )
28 | model = LlamaForCausalLM(config)
29 | tokenizer = LlamaTokenizer.from_pretrained("path_to_llama_7b")
30 | 
31 | # Test w/ one text
32 | query = "This is a test"
33 | query_tokens = tokenizer(query, return_tensors="pt")
34 | 
35 | input = {
36 |     "input_ids": query_tokens["input_ids"],
37 |     "attention_mask": query_tokens["attention_mask"],
38 | }
39 | 
40 | out_gen = model.generate(**input)
41 | text = tokenizer.batch_decode(out_gen)
42 | 
43 | # Save model + config + tokenizer
44 | model.half()  # makes it smaller
45 | model.save_pretrained(path)
46 | tokenizer.save_pretrained(path)
47 | 
48 | # test we can load it back
49 | model = LlamaForCausalLM.from_pretrained(path)
50 | 
51 | print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
52 | 
--------------------------------------------------------------------------------
/vision/m4/models/vgpt2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/models/vgpt2/__init__.py
--------------------------------------------------------------------------------
/vision/m4/models/vllama3/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/models/vllama3/__init__.py
--------------------------------------------------------------------------------
/vision/m4/models/vllama3/make_tiny_llama3.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # This script creates a super tiny model that is useful inside tests, when we just want to test that
4 | # the machinery works, without needing to check the quality of the outcomes.
5 | #
6 | # usage: adjust the configs if wanted, but otherwise just run the script
7 | 
8 | from pathlib import Path
9 | 
10 | from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
11 | 
12 | 
13 | mname_tiny = "tiny-random-Llama3ForCausalLM"
14 | 
15 | path = Path(mname_tiny)
16 | path.mkdir(parents=True, exist_ok=True)
17 | 
18 | config = LlamaConfig()
19 | config.update(
20 |     dict(
21 |         vocab_size=128_256,
22 |         hidden_size=16,
23 |         intermediate_size=16 * 4,
24 |         num_hidden_layers=2,
25 |         num_attention_heads=4,
26 |         num_key_value_heads=1,
27 |     )
28 | )
29 | model = LlamaForCausalLM(config)
30 | tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
31 | 
32 | # Test w/ one text
33 | query = "This is a test"
34 | query_tokens = tokenizer(query, return_tensors="pt")
35 | 
36 | input = {
37 |     "input_ids": query_tokens["input_ids"],
38 |     "attention_mask": query_tokens["attention_mask"],
39 | }
40 | 
41 | out_gen = model.generate(**input)
42 | text = tokenizer.batch_decode(out_gen)
43 | 
44 | # Save model + config + tokenizer
45 | model.half()  # makes it smaller
46 | model.save_pretrained(path)
47 | tokenizer.save_pretrained(path)
48 | 
49 | # test we can load it back
50 | model = LlamaForCausalLM.from_pretrained(path)
51 | 
52 | print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
53 | # Pushed to HuggingFaceM4/tiny-random-Llama3ForCausalLM
54 | 
--------------------------------------------------------------------------------
/vision/m4/models/vmistral/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/models/vmistral/__init__.py
--------------------------------------------------------------------------------
/vision/m4/models/vmistral/evaluation_classification_vqa_in_context_vmistral.py:
--------------------------------------------------------------------------------
1 | from m4.evaluation.custom_metrics.classification_vqa_metrics import ClassifVQAMetrics
2 | from m4.models.vgpt2.evaluation_classification_vqa_in_context_vgpt2 import Vgpt2ClassificationVQAInContext
3 | 
4 | 
5 | class VMistralClassificationVQAInContext(Vgpt2ClassificationVQAInContext):
6 |     model_class: str = "VMistralForCausalLM"
7 |     tokenizer_max_seq_len = 4096
8 | 
9 | 
10 | class VQAv2VMistralClassificationVQAInContextAcc(VMistralClassificationVQAInContext):
11 |     dataset_name: str = "HuggingFaceM4/VQAv2_modif"
12 |     metric_name: str = "ClassificationVQAMetrics"
13 |     metric_kwargs = {
14 |         "metrics": [
15 |             ClassifVQAMetrics.VQA_ACCURACY,
16 |             ClassifVQAMetrics.ENTROPY_DISTRIBUTION,
17 |             ClassifVQAMetrics.ENTROPY_MEAN,
18 |         ]
19 |     }
20 |     default_query_split_name: str = "validation"
21 |     default_support_split_name: str = "train"
22 |     image_column_name: str = "image"
23 |     question_column_name: str = "question"
24 |     answers_column_name: str = "answers"
25 |     length_normalize: bool = False
26 | 
27 | 
28 | class VQAv2SampleVMistralClassificationVQAInContextAcc(VQAv2VMistralClassificationVQAInContextAcc):
29 |     dataset_name: str = "HuggingFaceM4/VQAv2_modif-Sample"
30 | 
--------------------------------------------------------------------------------
/vision/m4/models/vmistral/make_tiny_mistral.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # This script creates a super tiny model that is useful inside tests, when we just want to test that
4 | # the machinery works, without needing to check the quality of the outcomes.
5 | #
6 | # usage: adjust the configs if wanted, but otherwise just run the script
7 | 
8 | from pathlib import Path
9 | 
10 | from transformers import AutoTokenizer, MistralConfig, MistralForCausalLM
11 | 
12 | 
13 | mname_tiny = "tiny-random-MistralForCausalLM"
14 | 
15 | path = Path(mname_tiny)
16 | path.mkdir(parents=True, exist_ok=True)
17 | 
18 | config = MistralConfig()
19 | config.update(
20 |     dict(
21 |         vocab_size=32000,
22 |         hidden_size=16,
23 |         intermediate_size=16 * 4,
24 |         num_hidden_layers=2,
25 |         num_attention_heads=4,
26 |         num_key_value_heads=1,
27 |     )
28 | )
29 | model = MistralForCausalLM(config)
30 | tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
31 | 
32 | # Test w/ one text
33 | query = "This is a test"
34 | query_tokens = tokenizer(query, return_tensors="pt")
35 | 
36 | input = {
37 |     "input_ids": query_tokens["input_ids"],
38 |     "attention_mask": query_tokens["attention_mask"],
39 | }
40 | 
41 | out_gen = model.generate(**input)
42 | text = tokenizer.batch_decode(out_gen)
43 | 
44 | # Save model + config + tokenizer
45 | model.half()  # makes it smaller
46 | model.save_pretrained(path)
47 | tokenizer.save_pretrained(path)
48 | 
49 | # test we can load it back
50 | model = MistralForCausalLM.from_pretrained(path)
51 | 
52 | print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
53 | # Pushed to HuggingFaceM4/tiny-random-MistralForCausalLM
54 | 
--------------------------------------------------------------------------------
/vision/m4/scripts/clean_jsonl_evals.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | 
4 | PATH_JSONL = "/Users/hugolaurencon/Desktop/tr_209_ift_mixture_test_final_evaluations.jsonl"
5 | 
6 | 
7 | BANNED_KEYS = [
8 |     "kl_distribution",
9 |     "entropy_distribution",
10 |     "kl_mean",
11 |     "Bleu_1",
12 |     "Bleu_1_all",
13 |     "Bleu_2",
14 |     "Bleu_2_all",
15 |     "Bleu_3",
16 |     "Bleu_3_all",
17 |     "Bleu_4",
18 |     "Bleu_4_all",
19 |     "METEOR",
20 |     "METEOR_all",
21 |     "CIDEr_all",
22 |     "ROUGE_L",
23 |     "ROUGE_L_all",
24 |     "per_bucket_accuracy",
25 |     "std_per_bucket_accuracy",
26 |     "entropy_mean",
27 | ]
28 | 
29 | 
30 | jsonl_data = []
31 | with open(PATH_JSONL, "r") as file:
32 |     for line in file:
33 |         json_data = json.loads(line)
34 |         jsonl_data.append(json_data)
35 | 
36 | 
37 | for idx, data in enumerate(jsonl_data):
38 |     if "score" in data:
39 |         if type(data["score"]) == str:
40 |             data["score"] = json.loads(data["score"].replace("'", '"'))
41 |         for banned_key in BANNED_KEYS:
42 |             if banned_key in data["score"]:
43 |                 data["score"].pop(banned_key)
44 |     jsonl_data[idx] = data
45 | 
46 | 
47 | with open(PATH_JSONL, "w") as file:
48 |     for item in jsonl_data:
49 |         item_json = json.dumps(item)
50 |         file.write(item_json + "\n")
51 | 
--------------------------------------------------------------------------------
/vision/m4/scripts/convert_vmistral_lm_head.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import json
3 | 
4 | from safetensors import safe_open
5 | from safetensors.torch import save_file
6 | 
7 | 
8 | model_path = "/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_272_bis_opt_step_15000_merge_and_resize_eou_renamed_lmhead/unwrapped_model"
9 | safetensor_files = glob.glob(f"{model_path}/model*.safetensors")
10 | 
11 | KEYS_TO_MODIFY_MAPPING = {
"lm_head.additional_fc": "additional_fc", 13 | } 14 | 15 | for file in safetensor_files: 16 | tensors = {} 17 | with safe_open(file, framework="pt", device="cpu") as f: 18 | for old_key in f.keys(): 19 | final_key = old_key 20 | for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): 21 | if key_to_modify in old_key: 22 | final_key = old_key.replace(key_to_modify, new_key) 23 | tensors[final_key] = f.get_tensor(old_key) 24 | print(f"{tensors.keys()}") 25 | save_file(tensors, file, metadata={"format": "pt"}) 26 | 27 | with open(f"{model_path}/model.safetensors.index.json", "r") as f: 28 | data = json.load(f) 29 | keys_to_iterate = list(data["weight_map"].keys()) 30 | new_data_weight_map = {} 31 | for old_key, v in data["weight_map"].items(): 32 | final_key = old_key 33 | for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): 34 | if key_to_modify in old_key: 35 | final_key = old_key.replace(key_to_modify, new_key) 36 | new_data_weight_map[final_key] = v 37 | data["weight_map"] = new_data_weight_map 38 | 39 | with open(f"{model_path}/model.safetensors.index.json", "w") as f: 40 | json_object = json.dumps(data, indent=4) 41 | f.write(json_object) 42 | -------------------------------------------------------------------------------- /vision/m4/scripts/job_update_siglip_model_pos_embeds.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=update_siglip_model_pos_embeds.py 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=96 6 | #SBATCH --mem-per-cpu=20G 7 | #SBATCH --output=/fsx/m4/experiments/general_logs/update_siglip_model_pos_embeds/%x_%j.out 8 | #SBATCH --time=20:00:00 9 | #SBATCH --partition=hopper-prod 10 | #SBATCH --gpus=1 11 | #SBATCH --qos high 12 | 13 | 14 | set -x -e 15 | 16 | source /fsx/m4/start-m4-user 17 | conda activate base 18 | conda activate shared-m4 19 | 20 | 21 | WORKING_DIR=/fsx/hugo/repos/m4_36 22 | pushd $WORKING_DIR 23 | 24 | python m4/scripts/update_siglip_model_pos_embeds.py 25 | -------------------------------------------------------------------------------- /vision/m4/scripts/merge_lora_and_behead.sh: -------------------------------------------------------------------------------- 1 | set -x -e 2 | 3 | source /fsx/m4/start-m4-user 4 | conda activate victor 5 | 6 | INPUT_DIR=/fsx/m4/experiments/local_experiment_dir/tr_289_288_ter_12600_lima_sft/opt_step-1400 7 | OUTPUT_DIR=/fsx/m4/victor/idefics2 8 | 9 | SCRIPT_RELATIVE_PATH="${BASH_SOURCE[0]}" 10 | PATH_TO_THIS_FILE=$(realpath "$SCRIPT_RELATIVE_PATH") 11 | echo "The absolute path of the current script file is: $PATH_TO_THIS_FILE" 12 | 13 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 14 | WORKING_DIR=$(builtin cd $SCRIPT_DIR/; pwd) 15 | echo "Working dir is: $WORKING_DIR" 16 | 17 | cd $WORKING_DIR 18 | 19 | 20 | python merge_lora_and_save.py $INPUT_DIR $OUTPUT_DIR 21 | echo "Finished merge lora" 22 | mv $OUTPUT_DIR/unwrapped_model/model* $OUTPUT_DIR 23 | rm -rf $OUTPUT_DIR/unwrapped_model 24 | rm -rf $OUTPUT_DIR/tokenizer # Just a sanity 25 | 26 | 27 | python behead_unused_params.py \ 28 | --model_dir $OUTPUT_DIR \ 29 | --behead_siglip_pooling \ 30 | --behead_perceiver_rmsnorm 31 | echo "Finished behead unused parameters" 32 | 33 | # Push `/fsx/m4/victor/idefics2` to `HuggingFaceM4/idefics2` 34 | # Then call optionally to transform into transformers compatible checkpoint and push to `HuggingFaceM4/idefics2-tfrm-compatible` 35 | # python 
35 | # python transformers/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py \
36 | #     --original_model_id HuggingFaceM4/idefics2 \
37 | #     --output_hub_path /fsx/m4/victor/idefics2-tfrm-compatible
38 | 
--------------------------------------------------------------------------------
/vision/m4/scripts/merge_lora_and_resize_eou_template.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora-and-resize-eou
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras_and_resize_eou/%x-%j.out
10 | #SBATCH --qos=high
11 | 
12 | set -e
13 | 
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 |     # check the original location through scontrol and $SLURM_JOB_ID
17 |     SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 |     # otherwise: started with bash. Get the real location.
20 |     SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 | 
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/tr_315_vsmollm_long_context/opt_step-12810/"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_315_vsmollm_long_contex_opt_step_12810_merge_and_resize_eou"
29 | 
30 | source /fsx/m4/start-m4-user
31 | conda activate base
32 | conda activate $CONDA_ENV_NAME
33 | pushd $M4_REPO_PATH
34 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
35 | 
36 | python $M4_REPO_PATH/m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
37 | python $M4_REPO_PATH/m4/scripts/resize_embed_for_eou.py $OUTPUT_DIR
38 | echo "Done"
39 | 
--------------------------------------------------------------------------------
/vision/m4/scripts/merge_lora_template.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=template-merge-lora
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:0
6 | #SBATCH --cpus-per-task=12
7 | #SBATCH --time=3:00:00
8 | #SBATCH --partition=hopper-prod
9 | #SBATCH --output=/fsx/m4/experiments/general_logs/merge_loras/%x-%j.out
10 | 
11 | 
12 | set -e
13 | 
14 | # ----------------- Auto-Workdir -----------------
15 | if [ -n "$SLURM_JOB_ID" ]; then
16 |     # check the original location through scontrol and $SLURM_JOB_ID
17 |     SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
18 | else
19 |     # otherwise: started with bash. Get the real location.
20 |     SCRIPT_PATH=$(realpath $0)
21 | fi
22 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
23 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
24 | 
25 | # --------------------------------------------------
26 | CONDA_ENV_NAME="shared-m4"
27 | OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/template_dir"
28 | OUTPUT_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/template_out_dir"
29 | 
30 | 
31 | source /fsx/m4/start-m4-user
32 | conda activate base
33 | conda activate $CONDA_ENV_NAME
34 | pushd $M4_REPO_PATH
35 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
36 | 
37 | python $M4_REPO_PATH/m4/scripts/merge_lora_and_save.py $OPT_STEP_DIR $OUTPUT_DIR
38 | 
--------------------------------------------------------------------------------
/vision/m4/scripts/s3_checkpoint_download_convert_upload.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_test-s3-download-and-convert-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=production-cluster
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/logs/%x-%j.out
8 | 
9 | 
10 | set -e
11 | 
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 |     # check the original location through scontrol and $SLURM_JOB_ID
15 |     SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 |     # otherwise: started with bash. Get the real location.
18 |     SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
22 | 
23 | # --------------------------------------------------
24 | 
25 | ### EDIT ME START ###
26 | 
27 | CONDA_ENV_NAME=shared-m4
28 | 
29 | EXPERIMENT_NAME=tr_194_laion_cm4_mix
30 | 
31 | opt_step_num_list=(
32 |     "1000"
33 |     "2000"
34 | )
35 | 
36 | ### EDIT ME END ###
37 | 
38 | 
39 | echo "START TIME: $(date)"
40 | 
41 | source /fsx/m4/start-m4-user
42 | conda activate base
43 | conda activate $CONDA_ENV_NAME
44 | pushd $M4_REPO_PATH
45 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
46 | 
47 | echo "running checkpoint download, convert, upload for opt-steps: ${opt_step_num_list[@]} of experiment: $EXPERIMENT_NAME"
48 | 
49 | python $M4_REPO_PATH/m4/scripts/s3_checkpoint_download_convert_upload.py $EXPERIMENT_NAME ${opt_step_num_list[@]} $M4_REPO_PATH
50 | 
51 | echo "END TIME: $(date)"
52 | 
--------------------------------------------------------------------------------
/vision/m4/scripts/s3_downloaded_checkpoints_cleanup.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=tr_test-s3-cleanup-checkpoints
3 | #SBATCH --ntasks=1
4 | #SBATCH --nodes=1
5 | #SBATCH --time=3:00:00
6 | #SBATCH --partition=production-cluster
7 | #SBATCH --output=/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/logs/%x-%j.out
8 | 
9 | 
10 | set -e
11 | 
12 | # ----------------- Auto-Workdir -----------------
13 | if [ -n "$SLURM_JOB_ID" ]; then
14 |     # check the original location through scontrol and $SLURM_JOB_ID
15 |     SCRIPT_PATH=$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')
16 | else
17 |     # otherwise: started with bash. Get the real location.
18 |     SCRIPT_PATH=$(realpath $0)
19 | fi
20 | SCRIPT_DIR=$(dirname ${SCRIPT_PATH})
21 | M4_REPO_PATH=$(builtin cd $SCRIPT_DIR/../../; pwd)
22 | 
23 | # --------------------------------------------------
24 | 
25 | ### EDIT ME START ###
26 | 
27 | CONDA_ENV_NAME=shared-m4
28 | 
29 | EXPERIMENT_NAME=tr_194_laion_cm4_mix
30 | 
31 | opt_step_num_list=(
32 |     "1000"
33 |     "2000"
34 | )
35 | 
36 | ### EDIT ME END ###
37 | 
38 | 
39 | echo "START TIME: $(date)"
40 | 
41 | source /fsx/m4/start-m4-user
42 | conda activate base
43 | conda activate $CONDA_ENV_NAME
44 | pushd $M4_REPO_PATH
45 | export PYTHONPATH=$WORKING_DIR:$PYTHONPATH
46 | 
47 | for opt_step_num in ${opt_step_num_list[@]}
48 | do
49 |     OPT_STEP_DIR="/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/${EXPERIMENT_NAME}/opt_step-${opt_step_num}"
50 |     rm -r $OPT_STEP_DIR
51 |     echo "Deleted $OPT_STEP_DIR of experiment: $EXPERIMENT_NAME"
52 | done
53 | 
54 | echo "END TIME: $(date)"
55 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/callers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/callers/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/configs/config_extract_web_documents.yaml:
--------------------------------------------------------------------------------
1 | dom_tree_simplificator:
2 |   strip_multiple_linebreaks: True
3 |   strip_multiple_spaces: True
4 |   remove_html_comments: True
5 |   replace_line_break_tags: True
6 |   unwrap_tags: True
7 |   strip_tags: True
8 |   strip_special_divs: True
9 |   remove_dates: True
10 |   remove_empty_leaves: True
11 |   unnest_nodes: True
12 |   remake_tree: True
13 |   css_rules:
14 |     - "[class~='footer']"
15 |     - "[class~='site-info']"
16 |   css_rules_replace_with_text: {"[class~='more-link']": "\n\nEND_OF_DOCUMENT_TOKEN_TO_BE_REPLACED\n\n"}
17 | pre_extraction_simplificator:
18 |   only_text_image_nodes: True
19 |   format_texts: True
20 |   merge_consecutive_text_nodes: True
21 | web_document_extractor:
22 |   image_size: 256
23 |   resize_mode: no
24 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/configs/config_filter_laion_pairs.yaml:
--------------------------------------------------------------------------------
1 | cond_check_size_image: True
2 | original_width_min_cutoff: 150
3 | original_width_max_cutoff: 10_000
4 | original_height_min_cutoff: 150
5 | original_height_max_cutoff: 10_000
6 | aspect_ratio_max_cutoff: 3
7 | cond_check_number_words: True
8 | number_words_min_cutoff: 1
9 | number_words_max_cutoff: 256
10 | cond_check_word_repetition_ratio: True
11 | word_repetition_length: 1
12 | word_repetition_max_cutoff: 0.7
13 | cond_check_special_character_ratio: True
14 | special_character_ratio_max_cutoff: 0.4
15 | cond_check_common_word_ratio: True
16 | common_word_ratio_min_cutoff: 0.7
17 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/configs/config_filter_text_image_pairs.yaml:
--------------------------------------------------------------------------------
1 | cond_check_image_in_simplified_dom_tree: True
2 | cond_check_format: True
3 | valid_formats: !!set {jpg, jpeg, png, webp}
4 | cond_check_size_image: True
5 | original_width_min_cutoff: 100
6 | # PIL decompression bomb warning is at 178M pixels, and 10000^2=100M is close
7 | original_width_max_cutoff: 10000
8 | original_height_min_cutoff: 100
9 | original_height_max_cutoff: 10000
10 | rendered_width_min_cutoff: 100
11 | rendered_width_max_cutoff: 100000
12 | rendered_height_min_cutoff: 100
13 | rendered_height_max_cutoff: 100000
14 | aspect_ratio_max_cutoff: 3
15 | cond_remove_non_printing_characters: False # Warning if set to True, it contains " " and "\n"
16 | cond_standardize_whitespace: True
17 | cond_check_number_words: True
18 | number_words_min_cutoff: 3
19 | number_words_max_cutoff: 256
20 | cond_check_special_character_ratio: True
21 | special_character_ratio_max_cutoff: 0.4
22 | cond_check_stopword_ratio: False
23 | stopword_ratio_min_cutoff: 0
24 | cond_check_repetition_ratio: True
25 | repetition_ratio_max_cutoff: 0.3
26 | cond_check_clip_score: True
27 | clip_score_min_cutoff: 0.2
28 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/debug/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/debug/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/README.md:
--------------------------------------------------------------------------------
1 | # Clip distributions - descriptive stats
2 | 
3 | - SBU Captions
4 | ```python
5 | DescribeResult(nobs=10000, minmax=(0.11153904348611832, 0.44991129636764526), mean=0.2874957061290741, variance=0.0016425453395696478, skewness=-0.22512623318313724, kurtosis=0.1512977180455395)
6 | ```
7 | 
8 | - Red Caps
9 | ```python
10 | DescribeResult(nobs=10000, minmax=(0.08980361372232437, 0.4210364818572998), mean=0.3082767878524959, variance=0.001230211924011678, skewness=-0.5157219676083339, kurtosis=0.6965278169334876)
11 | ```
12 | 
13 | - LAION 400M
14 | ```python
15 | DescribeResult(nobs=10000, minmax=(0.16056129336357117, 0.4760231077671051), mean=0.333618477447331, variance=0.0008586748609226699, skewness=0.7131919650316029, kurtosis=1.668628208211425)
16 | ```
17 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/clip_scores_laion400m_10000.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/clip_scores_laion400m_10000.npy
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/clip_scores_red_caps_10000.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/clip_scores_red_caps_10000.npy
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/clip_scores_sbu_captions_10000.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/clip_scores_sbu_captions_10000.npy
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/distributions_extracted.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/distributions_extracted.png
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/outputs/distributions_reference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/outputs/distributions_reference.png
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/processors/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.sourcing.data_collection.processors.dom_tree_simplificator import DOMTreeSimplificator
2 | from m4.sourcing.data_collection.processors.html_extractor import HtmlExtractor
3 | from m4.sourcing.data_collection.processors.image_deduplicator import ImageDeduplicator
4 | from m4.sourcing.data_collection.processors.pair_extractor import TextMediaPairsExtractor
5 | from m4.sourcing.data_collection.processors.pair_filtering import PairFiltering
6 | from m4.sourcing.data_collection.processors.pre_extraction_simplificator import PreExtractionSimplificator
7 | from m4.sourcing.data_collection.processors.warc_downloader import WarcDownloader
8 | from m4.sourcing.data_collection.processors.web_document_extractor import CommonCrawlWebDocumentExtractor
9 | from m4.sourcing.data_collection.processors.web_document_filtering import (
10 |     FilteringFunctions,
11 |     WebDocumentFilteringDocLevel,
12 |     WebDocumentFilteringNodeLevel,
13 | )
14 | from m4.sourcing.data_collection.processors.web_document_image_deduplication import WebDocumentImageDeduplication
15 | from m4.sourcing.data_collection.processors.web_document_line_deduplication import WebDocumentLineDeduplication
16 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/processors/warc_downloader.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | from botocore.config import Config
3 | from botocore.exceptions import ClientError, ProxyConnectionError
4 | 
5 | 
6 | class WarcDownloader:
7 |     def __init__(self):
8 |         config_boto = Config(
9 |             # region_name="us-east-1",  # Location of the CC data, commenting this line since it doesn't help
10 |             retries={"max_attempts": 10, "mode": "standard"}
11 |         )
12 |         self.client = boto3.client("s3", config=config_boto)
13 | 
14 |     def __call__(self, example):
15 |         if example["warc"] and not example["warc_error"]:
16 |             return example
17 | 
18 |         warc_filename = example["warc_filename"]
19 |         warc_record_offset = example["warc_record_offset"]
20 |         warc_record_length = example["warc_record_length"]
21 | 
22 |         warc, warc_error = self.get_warc_from_metadata(
23 |             client=self.client,
24 |             warc_filename=warc_filename,
25 |             warc_record_offset=warc_record_offset,
26 |             warc_record_length=warc_record_length,
27 |         )
28 |         example["warc"] = warc
29 |         example["warc_error"] = warc_error
30 |         return example
31 | 
32 |     def get_warc_from_metadata(self, client, warc_filename, warc_record_offset, warc_record_length):
33 |         try:
34 |             response = client.get_object(
35 |                 Bucket="commoncrawl",
36 |                 Key=warc_filename,
37 |                 Range=f"bytes={warc_record_offset}-{warc_record_offset + warc_record_length - 1}",
38 |             )
39 |         except (ClientError, ProxyConnectionError) as e:
40 |             return b"", repr(e)
41 |         return response["Body"].read(), ""
42 | 
43 |     # Needed to make multiprocessing work
44 |     def __reduce__(self):
45 |         return (self.__class__, ())
46 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from m4.sourcing.data_collection.utils.clip_utils import compute_clip_score
2 | from m4.sourcing.data_collection.utils.fetching_utils import fetch_single_image
3 | from m4.sourcing.data_collection.utils.filtering_utils import (
4 |     DIGITS_RE,
5 |     FLAGGED_WORDS,
6 |     NON_PRINTING_CHARACTERS_RE,
7 |     PUNCTUATION,
8 |     SPECIAL_CHARACTERS,
9 |     STOPWORDS,
10 |     UNICODE_PUNCTUATION,
11 | )
12 | from m4.sourcing.data_collection.utils.kl_utils import NB_BINS, kl_div
13 | from m4.sourcing.data_collection.utils.simplification_utils import (
14 |     TAG_TO_SEP,
15 |     format_filename,
16 |     format_image_size,
17 |     format_relative_to_absolute_path,
18 |     get_media_src,
19 |     is_url_valid,
20 |     simplify_media_node,
21 | )
22 | from m4.sourcing.data_collection.utils.tags_attributes import (
23 |     INTERESTING_TAGS_SET,
24 |     MEDIA_CONTAIN_INTERESTING_ATTRIBUTES_SET,
25 |     UNWRAP_TAGS,
26 |     InterestingAttributesSetCategory,
27 | )
28 | from m4.sourcing.data_collection.utils.utils import load_dataset_html, make_selectolax_tree
29 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/utils/kl_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | NB_BINS = 40
5 | 
6 | 
7 | def kl_div(p, q, nb_bins=NB_BINS):
8 |     freq_p, _ = np.histogram(p, bins=nb_bins, range=(0.0, 1.0), density=True)
9 |     freq_q, _ = np.histogram(q, bins=nb_bins, range=(0.0, 1.0), density=True)
10 |     elem = freq_p * np.log(freq_p / freq_q)
11 |     return np.sum(np.where((~np.isnan(elem)) & (freq_q != 0), elem, 0))
12 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/utils/utils.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from selectolax.parser import HTMLParser
3 | 
4 | 
5 | def load_dataset_html(shuffle=False, buffer_size=10000, seed=42):
6 |     dataset = load_dataset(
7 |         "bs-modeling-metadata/c4-en-html-with-metadata",
8 |         streaming=True,
9 |         split="train",
10 |         use_auth_token=True,
11 |     )
12 |     if shuffle:
13 |         dataset = dataset.shuffle(buffer_size=buffer_size, seed=seed)
14 |     dataset = iter(dataset)
15 |     return dataset
16 | 
17 | 
18 | def make_selectolax_tree(html_str):
19 |     selectolax_tree = HTMLParser(html_str)
20 |     return selectolax_tree
21 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/visualization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/data_collection/visualization/__init__.py
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/visualization/choose_filtering_parameters_laion_pairs.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import streamlit as st
3 | 
4 | 
5 | if __name__ == "__main__":
6 |     st.set_page_config(layout="wide")
7 |     st.title("Visualization to help choose the filtering parameters for image / text pair datasets")
8 | 
9 |     path_stats = "./large_files/stats_vis_choose_filtering_params.pkl"
10 |     df_stats = pd.read_pickle(path_stats)
11 | 
12 |     num_considered_examples = st.number_input(
13 |         "Choose the number of image / text pairs to consider",
14 |         min_value=0,
15 |         max_value=len(df_stats),
16 |         value=1_000,
17 |         help=f"Enter a number between 0 and {len(df_stats)}",
18 |     )
19 |     df_stats = df_stats.head(num_considered_examples)
20 | 
21 |     order_sort = st.selectbox("Sorting in", options=["ascending order", "descending order"], index=0)
22 |     stat_sort_on = st.selectbox(
23 |         "Sorting on",
24 |         options=[name for name in list(df_stats.columns.values) if name not in ["img", "caption"]],
25 |         index=0,
26 |     )
27 |     ascending_sort = True if "ascending" in order_sort else False
28 |     df_stats = df_stats.sort_values(stat_sort_on, ascending=ascending_sort)
29 | 
30 |     html_data_frame = df_stats.to_html(escape=False)
31 |     st.markdown(html_data_frame, unsafe_allow_html=True)
32 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/data_collection/visualization/wikipedia/explore_wiki_results.py:
--------------------------------------------------------------------------------
1 | import random
2 | from pathlib import Path
3 | 
4 | import streamlit as st
5 | from datasets import load_from_disk
6 | 
7 | 
8 | st.set_page_config(layout="wide")
9 | 
10 | processed_data_dir = Path("/home/lucile/local_datasets/enwiki/enwiki-NS0-20230220-ENTERPRISE-HTML-EXTRACTION")
11 | original_data_dir = Path("/home/lucile/local_datasets/enwiki/enwiki-NS0-20230220-ENTERPRISE-HTML")
12 | shard_id = 30
13 | exclude_shards = [34]
14 | 
15 | processed_ds_name_2 = (
16 |     processed_data_dir / f"shard_{shard_id}" / "wikipedia_html_enterprise-with-images-and-html-full-v1-v2"
17 | )
18 | shard_ds = load_from_disk(processed_ds_name_2)
19 | 
20 | shard_ds = shard_ds.filter(lambda x: x["html"] is not None)
21 | num_docs = len(shard_ds)
22 | 
23 | st.header("Document")
24 | if st.button("Select a random document"):
25 |     dct_idx = random.randint(a=0, b=num_docs - 1)
26 | else:
27 |     dct_idx = 0
28 | idx = st.number_input(
29 |     f"Select a document among the first {num_docs} ones",
30 |     min_value=0,
31 |     max_value=num_docs - 1,
32 |     value=dct_idx,
33 |     step=1,
34 |     help=f"Index between 0 and {num_docs-1}",
35 | )
36 | current_example = shard_ds[idx]
37 | current_html = current_example["html"]
38 | 
39 | 
40 | col1, col2 = st.columns(2)
41 | with col1:
42 |     st.subheader("Raw html rendering")
43 |     st.components.v1.html(current_html, height=700, scrolling=True)
44 | with col2:
45 |     st.subheader("Texts and images extracted from the html")
46 |     for text, img in zip(current_example["texts"], current_example["images"]):
47 |         if img is not None:
48 |             st.image(img, caption=text)
49 |         else:
50 |             st.write(text)
51 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/get_modelling_metadata_dataset/get_modelling_metadata_dataset.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=m4_get_dataset # (change me!) job name
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # (change me! between 0 and 48) number of cores per task
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --time 012:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
8 | #SBATCH --output=/gpfsdswork/projects/rech/cnw/uue59kq/logs/get_dataset/%j-%x.out # output file name
9 | #SBATCH --account=cnw@cpu # account
10 | #SBATCH --array=0-2756
11 | #SBATCH --partition=cpu_p1
12 | 
13 | set -x -e
14 | 
15 | source $cnw_ALL_CCFRWORK/start-m4-user
16 | conda activate lucile-m4
17 | 
18 | export HF_DATASETS_OFFLINE=1
19 | export HF_DATASETS_CACHE=/gpfsscratch/rech/cnw/uue59kq/to_delete
20 | 
21 | WORKING_DIR=/gpfswork/rech/cnw/uue59kq/repos/m4/m4/sourcing/processing/extracting_documents/get_modelling_metadata_dataset
22 | pushd $WORKING_DIR
23 | 
24 | readarray -t SHARD_NAMES < shard_names.txt
25 | SHARD_NAME=${SHARD_NAMES[$SLURM_ARRAY_TASK_ID]}
26 | echo "Downloading shard: "$SHARD_NAME
27 | 
28 | python get_modelling_metadata_dataset.py \
29 |     --dataset-path /gpfsscratch/rech/cnw/urd43gx/c4-en-html-with-metadata/ \
30 |     --save-dir /gpfsscratch/rech/cnw/commun/local_datasets/c4-en-html-with-metadata-arrow/ \
31 |     --shard-name $SHARD_NAME
32 | 
--------------------------------------------------------------------------------
/vision/m4/sourcing/pmd/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | from functools import lru_cache
3 | from pathlib import Path
4 | 
5 | import datasets
6 | 
7 | 
8 | DEFAULT_M4_CACHE_HOME = Path("~/.cache/m4")
9 | 
10 | 
11 | @lru_cache(maxsize=1)
12 | def get_m4_cache_dir() -> Path:
13 |     return (Path(os.environ["M4_MANUAL_DIR"]) if "M4_MANUAL_DIR" in os.environ else DEFAULT_M4_CACHE_HOME).expanduser()
14 | 
15 | 
16 | @lru_cache(maxsize=1)
17 | def get_jz_dataset_dir() -> Path:
18 |     if "DSDIR" in os.environ:
19 |         return Path(os.environ["DSDIR"]).expanduser()
20 |     raise ValueError("We're not in JZ. This method should only be called when running in JZ.")
21 | 
22 | 
23 | # All PMD datasets should follow a single feature API.
24 | _FEATURES = datasets.Features(
25 |     {
26 |         "image": datasets.Image(),
27 |         "text": datasets.Value("string"),
28 |         # Define where the sample comes from; this is necessary when we start to use aggregated versions like PMD.
29 | "source": datasets.Value("string"), 30 | # We store any kind of additional information in JSON format in `meta` 31 | "meta": datasets.Value("string"), 32 | } 33 | ) 34 | -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/fix_image_path.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Dict, List 2 | 3 | import datasets 4 | 5 | 6 | """ 7 | Images can be stored in `datasets` using bytes or a path to an actual file. If a path is given, one needs to make the 8 | path work with the local setup. To do so, we remove a prefix and replace it with an environment-dependent one. 9 | `home/thomas_wang_hugginface_co/.cache/m4/...` -> f"{get_m4_cache_dir()}/..." 10 | """ 11 | 12 | 13 | def get_image_paths_fixer(image_column_name: str, image_path_fixer: Callable[[str], str]): 14 | image_feature = datasets.Image(decode=True) 15 | 16 | def image_paths_fixer(batch: Dict[str, List]) -> Dict[str, List]: 17 | # Image(decode=False) allows the images to be `{'path': str, 'bytes': bytes}` dicts 18 | image_dicts = batch[image_column_name] 19 | 20 | for image_dict in image_dicts: 21 | # We ignore Images that store bytes directly 22 | if image_dict["bytes"] is not None: 23 | continue 24 | 25 | path = image_dict["path"] 26 | assert path is not None 27 | new_path = image_path_fixer(path) 28 | assert new_path is not None 29 | # Careful: that's an in-place operation, which updates the dict stored in `batch` 30 | image_dict["path"] = new_path 31 | 32 | batch[image_column_name] = [image_feature.decode_example(image_dict) for image_dict in image_dicts] 33 | return batch 34 | 35 | return image_paths_fixer 36 | -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/jz_loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/jz_loaders/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/jz_loaders/jz_conceptual_captions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/jz_loaders/jz_conceptual_captions/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/jz_loaders/jz_wit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/jz_loaders/jz_wit/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/local_loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/local_loaders/coco/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/coco/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/local_loaders/localized_narratives__ADE20k/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/localized_narratives__ADE20k/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/local_loaders/localized_narratives__coco/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/localized_narratives__coco/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/local_loaders/localized_narratives__flickr30k/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/localized_narratives__flickr30k/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/local_loaders/localized_narratives__openimages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/localized_narratives__openimages/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/local_loaders/yfcc100m/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/pmd/local_loaders/yfcc100m/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/scripts/jz_image_pmd.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=jz_image_pmd 3 | #SBATCH --qos=qos_cpu-t4 4 | #SBATCH --ntasks=1 5 | #SBATCH --cpus-per-task=40 6 | #SBATCH --partition=cpu_p1 7 | #SBATCH --hint=nomultithread 8 | #SBATCH --array=0-1%2 9 | #SBATCH --time=100:00:00 10 | #SBATCH --output=output-%x-%A_%a.out 11 | #SBATCH --error=output-%x-%A_%a.out 12 | #SBATCH --account=cnw@cpu 13 | #SBATCH --mail-type=ALL 14 | #SBATCH --mail-user=victor@huggingface.co 15 | 16 | source ~/.bashrc_cnw 17 | eval "$(conda shell.bash hook)" 18 | conda activate victor 19 | 20 | export HF_DATASETS_OFFLINE=1 21 | export HF_DATASETS_CACHE=$cnw_ALL_CCFRSCRATCH/datasets 22 | 23 | all_dataset=( 24 | jz_wit 25 | jz_conceptual_captions 26 | ) 27 | dataset_name=${all_dataset[${SLURM_ARRAY_TASK_ID}]} 28 | 29 | python jz_pmd.py --dataset_name $dataset_name --loading_script_path $WORK/code/m4/m4/sourcing/pmd/jz_loaders/$dataset_name 30 | -------------------------------------------------------------------------------- /vision/m4/sourcing/pmd/scripts/jz_pmd.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | 
import os 4 | 5 | from datasets import DatasetDict, load_dataset 6 | 7 | from m4.utils.datasets.get_self_contained_ds import process_ds_wrapped 8 | 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser(description="Create arrow files for subsets of image PMD - JZ version.") 12 | parser.add_argument( 13 | "--dataset_name", type=str, required=True, help="Should be either `jz_conceptual_captions` or `jz_wit`." 14 | ) 15 | parser.add_argument("--loading_script_path", type=str, required=True, help="Path to the loading script.") 16 | parser.add_argument( 17 | "--num_proc", 18 | type=int, 19 | default=1, 20 | help="Number of processes for multiprocessing (in particular for calls to `map`).", 21 | ) 22 | args = parser.parse_args() 23 | 24 | dataset_name = args.dataset_name.replace("jz_", "") 25 | 26 | dataset = load_dataset(args.loading_script_path) 27 | 28 | print("Start converting the images to bytes.") 29 | dataset = process_ds_wrapped(dataset, batch_size=1_000, num_proc=args.num_proc) 30 | 31 | print("Start saving shards.") 32 | if isinstance(dataset, DatasetDict): 33 | for split_name, dset in dataset.items(): 34 | nb_of_shards = math.ceil(len(dset) / 50_000) 35 | shards = [dset.shard(num_shards=nb_of_shards, index=i, contiguous=True) for i in range(nb_of_shards)] 36 | for i, shard in enumerate(shards): 37 | shard.save_to_disk( 38 | f"{os.environ['cnw_ALL_CCFRSCRATCH']}/general_pmd/image/{dataset_name}/{split_name}/{i:05}-{nb_of_shards:05}" 39 | ) 40 | else: 41 | raise ValueError(f"`dataset` is of type {type(dataset)} which is not supported yet.") 42 | -------------------------------------------------------------------------------- /vision/m4/sourcing/processing/README.md: -------------------------------------------------------------------------------- 1 | # Data Processing Pipelines 2 | 3 | Relates to issue [#12](https://github.com/huggingface/m4/issues/12). 4 | 5 | We have two v0 data processing pipelines: 6 | - (a) split (for sharding) + parallel/slurm arrays of whatever processing scripts (python or rust for instance) 7 | - (b) apache beam (for creating processing pipelines) + Dataflow (for horizontal scaling) 8 | 9 | ## App 10 | 11 | The ngram search is mostly an example. 12 | To launch the app: 13 | ```bash 14 | streamlit run app.py --server.port 6006 15 | ``` 16 | -------------------------------------------------------------------------------- /vision/m4/sourcing/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/processing/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/processing/extracting_ngrams/README.md: -------------------------------------------------------------------------------- 1 | ## Locally 2 | Run the `run_document_ngrams_extraction.sh` script. 3 | 4 | ## On JZ 5 | On JZ: 6 | - Add to your `~/.bashrc` the following line (custom installation of `jq` and `parallel`): 7 | ```bash 8 | export PATH=$PATH:/gpfswork/rech/six/commun/lib/jq-1.5/bin/:/gpfswork/rech/six/commun/lib/parallel/bin/ 9 | ``` 10 | 11 | Then, run the slurm script (`sbatch pipe.slurm`).
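For context, `pipe.slurm` applies pipeline (a) from the processing README: shard first, then fan out one subshard per slurm array task. A minimal sketch of that pattern (the file names and subshard size are illustrative, not the actual contents of `pipe.slurm`):
```bash
# Shard the dump into fixed-size subshards...
split --lines 2000 --numeric-suffixes dump.jsonl dump.texts.
# ...then launch one array task per subshard; each task processes its own file.
NB_SUBSHARDS=$(ls dump.texts.* | wc -l)
sbatch --array=0-$((NB_SUBSHARDS - 1)) pipe.slurm
```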
12 | -------------------------------------------------------------------------------- /vision/m4/sourcing/processing/extracting_ngrams/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/sourcing/processing/extracting_ngrams/__init__.py -------------------------------------------------------------------------------- /vision/m4/sourcing/processing/extracting_ngrams/run_document_ngrams_extraction.sh: -------------------------------------------------------------------------------- 1 | eval "$(conda shell.bash hook)" 2 | conda activate m3 3 | 4 | # TODO: update so that we can take in multiple shards 5 | N_WORKERS=4 6 | DATA_PATH=/home/victor_huggingface_co/m4/data 7 | SHARD_NAME=4e47925f7c894bd8eb56e5dd1d778ec77bf2c90f6cee0e32e31615393391c67a 8 | NB_DOCS_PER_SUBSHARD=2000 9 | 10 | # Get the text field only 11 | jq ".text" < $DATA_PATH/raw_dumps/$SHARD_NAME > $DATA_PATH/processed_dumps/$SHARD_NAME.texts 12 | # Get the URL field only 13 | jq -r "[input_line_number,.url] | @csv" < $DATA_PATH/raw_dumps/$SHARD_NAME > $DATA_PATH/extracted_databases/$SHARD_NAME.urls.csv 14 | # Get the HTML field only 15 | jq -r "[input_line_number,.html] | @csv" < $DATA_PATH/raw_dumps/$SHARD_NAME > $DATA_PATH/extracted_databases/$SHARD_NAME.htmls.csv 16 | 17 | # Splitting into subshards 18 | split --lines $NB_DOCS_PER_SUBSHARD --numeric-suffixes $DATA_PATH/processed_dumps/$SHARD_NAME.texts $DATA_PATH/processed_dumps/$SHARD_NAME.texts. 19 | 20 | # Extract ngrams in each document 21 | find $DATA_PATH/processed_dumps/ | \ 22 | grep "${DATA_PATH}/processed_dumps/${SHARD_NAME}.texts.[0-9][0-9]*" | \ 23 | parallel --verbose -j $N_WORKERS --progress "TRANSFORMERS_OFFLINE=1 TRANSFORMERS_VERBOSITY=error python extract_documents_ngrams.py --filepath {} --nb_docs_per_subshard $NB_DOCS_PER_SUBSHARD" > \ 24 | $DATA_PATH/extracted_databases/$SHARD_NAME.ngrams.csv 25 | 26 | # Remove the subshards 27 | find $DATA_PATH/processed_dumps/ | grep "${DATA_PATH}/processed_dumps/${SHARD_NAME}.texts.[0-9][0-9]*" | xargs -d"\n" rm 28 | -------------------------------------------------------------------------------- /vision/m4/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/training/__init__.py -------------------------------------------------------------------------------- /vision/m4/training/debug_utils.py: -------------------------------------------------------------------------------- 1 | """ Trainer debug utils """ 2 | 3 | 4 | def dump_optim_states(self): 5 | """dumps basic information about the state of the optimizer""" 6 | 7 | print("*** Optim States Dump:") 8 | param_groups_cnt = len(self.vl_optim.param_groups) 9 | # state dict has more than param_groups info, so extract only the param groups 10 | param_group_states = list(self.vl_optim.state.values())[:param_groups_cnt] 11 | for i, state in enumerate(param_group_states): 12 | print(f"param group: {i}") 13 | print(f" step={state['step']}") 14 | print(f" exp_avg all_zero={all(state['exp_avg'] == 0)}") 15 | print(f" exp_avg_sq all_zero={all(state['exp_avg_sq'] == 0)}") 16 | 17 | # can also dump LR state if need be 18 | # print(f"LR={self.vl_scheduler.get_last_lr()}") 19 | 20 | 21 | def validate_optim_states_are_reset(self): 22 | """ 23 | For a new or fully reset optimizer, we expect all-zero
`exp_avg` and `exp_avg_sq` state tensors and step=1 24 | """ 25 | 26 | param_groups_cnt = len(self.vl_optim.param_groups) 27 | param_group_states = list(self.vl_optim.state.values())[:param_groups_cnt] 28 | for i, state in enumerate(param_group_states): 29 | if state["step"] != 1: 30 | raise ValueError(f"optimizer reset didn't seem to work: state={i} step={state['step']}") 31 | if not all(state["exp_avg"] == 0): 32 | raise ValueError(f"optimizer reset didn't seem to work: state={i} exp_avg={state['exp_avg']}") 33 | if not all(state["exp_avg_sq"] == 0): 34 | raise ValueError(f"optimizer reset didn't seem to work: state={i} exp_avg_sq={state['exp_avg_sq']}") 35 | -------------------------------------------------------------------------------- /vision/m4/training/setup_language_model.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from m4.models.idefics.configuration_idefics import IdeficsConfig 4 | from m4.models.idefics.modeling_idefics import IdeficsForCausalLM 5 | from m4.models.vgpt2.configuration_vgpt2 import VGPT2Config 6 | from m4.models.vgpt2.modeling_vgpt2 import VGPT2LMHeadModel 7 | from m4.models.vllama3.configuration_vllama3 import VLlama3Config 8 | from m4.models.vllama3.modeling_vllama3 import VLlama3ForCausalLM 9 | from m4.models.vmistral.configuration_vmistral import VMistralConfig 10 | from m4.models.vmistral.modeling_vmistral import VMistralForCausalLM 11 | 12 | 13 | model_name2classes = { 14 | r"gpt2": [VGPT2Config, VGPT2LMHeadModel], 15 | r"idefics": [IdeficsConfig, IdeficsForCausalLM], 16 | r"mistral": [VMistralConfig, VMistralForCausalLM], 17 | r"llama": [VLlama3Config, VLlama3ForCausalLM], 18 | r"smollm": [VLlama3Config, VLlama3ForCausalLM], 19 | } 20 | 21 | 22 | def model_name_to_classes(model_name_or_path): 23 | """returns config_class, model_class for a given model name or path""" 24 | 25 | model_name_lowcase = model_name_or_path.lower() 26 | for rx, classes in model_name2classes.items(): 27 | if re.search(rx, model_name_lowcase): 28 | return classes 29 | else: 30 | raise ValueError( 31 | f"Unknown type of backbone LM. Got {model_name_or_path}, supported regexes:" 32 | f" {list(model_name2classes.keys())}." 33 | ) 34 | -------------------------------------------------------------------------------- /vision/m4/training/setup_vision_model.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from transformers import AutoModel 4 | 5 | 6 | # map to check the supported cv archs and also how to extract the model - in some archs, we want to 7 | # go through a specific prefix to get to the model, as in `model.vision_model` for clip 8 | vision_model_name2model = { 9 | r"clip": lambda model: model.vision_model, 10 | r"siglip": lambda model: model.vision_model, 11 | r"vit": lambda model: model, 12 | } 13 | 14 | 15 | def vision_model_name_to_model(model_name_or_path, model): 16 | """returns the model if supported, raises otherwise""" 17 | 18 | model_name_lowcase = model_name_or_path.lower() 19 | for rx, lookup in vision_model_name2model.items(): 20 | if re.search(rx, model_name_lowcase): 21 | return lookup(model) 22 | else: 23 | raise ValueError( 24 | f"Unknown type of backbone vision model. Got {model_name_or_path}, supported regexes:" 25 | f" {list(vision_model_name2model.keys())}."
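# note: the lookup above is a regex search over the lowercased checkpoint name, so a name like "openai/clip-vit-base-patch32" matches the r"clip" entry before r"vit" (dict insertion order decides ties)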
26 | ) 27 | 28 | 29 | def get_vision_model(config): 30 | vision_model_name = config.vision_model_name 31 | vision_model_params = eval(config.vision_model_params) 32 | 33 | model = AutoModel.from_pretrained(vision_model_name, **vision_model_params, trust_remote_code=True) 34 | return vision_model_name_to_model(vision_model_name, model) 35 | -------------------------------------------------------------------------------- /vision/m4/training/types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class DatasetNames(Enum): 5 | PMD = "pmd" 6 | LAION = "laion" 7 | LAION_COCO = "laion_coco" 8 | TIKZ = "tikz" 9 | CM4 = "cm4" 10 | WIKI = "wiki" 11 | IMAGE_WEBSITE_CODE = "image_website_code" 12 | VQAV2_TASK_FINETUNING = "vqav2_task_finetuning" 13 | OCR = "ocr" 14 | DOCVQA = "docvqa" 15 | SFT = "sft" 16 | 17 | 18 | class DatasetTypes(Enum): 19 | WEB_DOCUMENTS = "wd" 20 | IMAGE_CAPTION_PAIRS = "icp" 21 | VQAV2_TASK_FINETUNING = "vqav2_task_finetuning" 22 | OCR = "ocr" 23 | DOCVQA = "docvqa" 24 | SFT = "sft" 25 | -------------------------------------------------------------------------------- /vision/m4/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/utils/__init__.py -------------------------------------------------------------------------------- /vision/m4/utils/check_valid_tokenizer.py: -------------------------------------------------------------------------------- 1 | def check_valid_tokenizer(tokenizer) -> None: 2 | """Check that the special tokens were correctly added to the tokenizer, 3 | and that they are not normalized. 4 | """ 5 | tok_class = type(tokenizer).__name__.lower() 6 | if ("idefics" in tok_class) or ("mistral" in tok_class): 7 | assert "<unk>" in tokenizer.get_vocab() 8 | assert "<s>" in tokenizer.get_vocab() 9 | assert "</s>" in tokenizer.get_vocab() 10 | assert "<fake_token_around_image>" in tokenizer.get_vocab() 11 | assert "<image>" in tokenizer.get_vocab() 12 | 13 | for _, val in tokenizer.added_tokens_decoder.items(): 14 | assert not val.normalized # assert that normalized=False for all AddedToken 15 | -------------------------------------------------------------------------------- /vision/m4/utils/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/utils/datasets/__init__.py -------------------------------------------------------------------------------- /vision/m4/utils/debug.py: -------------------------------------------------------------------------------- 1 | import builtins 2 | import fcntl 3 | 4 | 5 | def printflock(*args, **kwargs): 6 | """ 7 | This is a wrapper around the built-in Python `print` which calls `flock` before calling 8 | `print` and unlocks it immediately after. This wrapper is useful for when each rank needs to 9 | print a message without getting it interleaved with prints from other ranks. 10 | The lock file is the file this wrapper is defined in. 11 | The output order will be random per rank.
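Without the lock, prints issued at the same time by several ranks could interleave mid-line.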
12 | 13 | Example: 14 | >>> # assuming 4 GPUs 15 | >>> world_size = dist.get_world_size() 16 | >>> rank = dist.get_rank() 17 | >>> printflock(f"This is a very long message from rank {rank}/{world_size}") 18 | This is a very long message from rank 0/4 19 | This is a very long message from rank 2/4 20 | This is a very long message from rank 3/4 21 | This is a very long message from rank 1/4 22 | 23 | It can also be used to override normal `print` for easier multi-GPU debugging: 24 | 25 | from m4.utils.debug import printflock as print 26 | 27 | and then you don't need to change anything in your code; the normal `print` calls will all be non-interleaved 28 | """ 29 | 30 | with open(__file__, "r") as fh: 31 | fcntl.flock(fh, fcntl.LOCK_EX) 32 | try: 33 | builtins.print(*args, **kwargs) 34 | finally: 35 | fcntl.flock(fh, fcntl.LOCK_UN) 36 | -------------------------------------------------------------------------------- /vision/m4/utils/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/smollm/542250b39015654d47083245bfb4c03332643bd6/vision/m4/utils/training/__init__.py --------------------------------------------------------------------------------