├── .gitattributes ├── .gitignore ├── DATASET.md ├── EMB.md ├── INSTALL.md ├── LICENSE ├── OMG_Seg_README.md ├── README.md ├── demo ├── README.md ├── configs │ ├── m2_convl.py │ ├── m2_convl_vid.py │ └── names │ │ └── th139_st101.py ├── image_demo.py ├── images │ ├── 350_6L1vA-xJt-M │ │ ├── 00002020.jpg │ │ ├── 00002023.jpg │ │ ├── 00002026.jpg │ │ ├── 00002029.jpg │ │ ├── 00002032.jpg │ │ ├── 00002035.jpg │ │ ├── 00002038.jpg │ │ ├── 00002041.jpg │ │ ├── 00002044.jpg │ │ ├── 00002047.jpg │ │ ├── 00002050.jpg │ │ ├── 00002053.jpg │ │ ├── 00002056.jpg │ │ ├── 00002059.jpg │ │ └── 00002062.jpg │ └── sa_1002.jpg └── video_demo.py ├── ext ├── cityscapes_scripts │ ├── createPanopticImgs.py │ └── helpers │ │ ├── __init__.py │ │ ├── annotation.py │ │ ├── csHelpers.py │ │ ├── labels.py │ │ ├── labels_cityPersons.py │ │ └── version.py ├── class_names │ └── VIPSeg.py ├── davis2017 │ ├── __init__.py │ ├── davis.py │ ├── evaluation.py │ ├── metrics.py │ ├── results.py │ └── utils.py ├── meta │ └── sam_meta.py ├── open_clip │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── coca_model.py │ ├── constants.py │ ├── factory.py │ ├── generation_utils.py │ ├── hf_configs.py │ ├── hf_model.py │ ├── loss.py │ ├── model.py │ ├── model_configs │ │ ├── EVA01-g-14-plus.json │ │ ├── EVA01-g-14.json │ │ ├── EVA02-B-16.json │ │ ├── EVA02-E-14-plus.json │ │ ├── EVA02-E-14.json │ │ ├── EVA02-L-14-336.json │ │ ├── EVA02-L-14.json │ │ ├── RN101-quickgelu.json │ │ ├── RN101.json │ │ ├── RN50-quickgelu.json │ │ ├── RN50.json │ │ ├── RN50x16.json │ │ ├── RN50x4.json │ │ ├── RN50x64.json │ │ ├── ViT-B-16-plus-240.json │ │ ├── ViT-B-16-plus.json │ │ ├── ViT-B-16.json │ │ ├── ViT-B-32-plus-256.json │ │ ├── ViT-B-32-quickgelu.json │ │ ├── ViT-B-32.json │ │ ├── ViT-H-14.json │ │ ├── ViT-H-16.json │ │ ├── ViT-L-14-280.json │ │ ├── ViT-L-14-336.json │ │ ├── ViT-L-14.json │ │ ├── ViT-L-16-320.json │ │ ├── ViT-L-16.json │ │ ├── ViT-M-16-alt.json │ │ ├── ViT-M-16.json │ │ ├── ViT-M-32-alt.json │ │ ├── ViT-M-32.json │ │ ├── ViT-S-16-alt.json │ │ ├── ViT-S-16.json │ │ ├── ViT-S-32-alt.json │ │ ├── ViT-S-32.json │ │ ├── ViT-bigG-14.json │ │ ├── ViT-e-14.json │ │ ├── ViT-g-14.json │ │ ├── coca_ViT-B-32.json │ │ ├── coca_ViT-L-14.json │ │ ├── coca_base.json │ │ ├── coca_roberta-ViT-B-32.json │ │ ├── convnext_base.json │ │ ├── convnext_base_w.json │ │ ├── convnext_base_w_320.json │ │ ├── convnext_large.json │ │ ├── convnext_large_d.json │ │ ├── convnext_large_d_320.json │ │ ├── convnext_small.json │ │ ├── convnext_tiny.json │ │ ├── convnext_xlarge.json │ │ ├── convnext_xxlarge.json │ │ ├── convnext_xxlarge_320.json │ │ ├── mt5-base-ViT-B-32.json │ │ ├── mt5-xl-ViT-H-14.json │ │ ├── roberta-ViT-B-32.json │ │ ├── swin_base_patch4_window7_224.json │ │ ├── vit_medium_patch16_gap_256.json │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ └── xlm-roberta-large-ViT-H-14.json │ ├── modified_resnet.py │ ├── openai.py │ ├── pretrained.py │ ├── push_to_hf_hub.py │ ├── timm_model.py │ ├── tokenizer.py │ ├── transform.py │ ├── transformer.py │ ├── utils.py │ ├── version.py │ ├── zero_shot_classifier.py │ └── zero_shot_metadata.py ├── sam │ ├── __init__.py │ ├── common.py │ ├── image_encoder.py │ ├── mask_decoder.py │ ├── prompt_encoder.py │ └── transformer.py └── templates │ ├── __init__.py │ └── vild.py ├── figs ├── method_comparison.jpg └── omg_teaser.jpg ├── omg_llava ├── .owners.yml ├── .pre-commit-config-zh-cn.yaml ├── .pre-commit-config.yaml ├── INSTALL.md ├── MANIFEST.in ├── README.md ├── figs │ 
└── omg_llava.png ├── omg_llava │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── finetune │ │ │ ├── omg_llava_7b_finetune_8gpus.py │ │ │ └── specific_tasks_finetune │ │ │ │ ├── finetune_gcg.py │ │ │ │ └── finetune_refseg.py │ │ └── pretrain │ │ │ └── omg_llava_7b_pretrain_8gpus.py │ ├── dataset │ │ ├── CombineDataset.py │ │ ├── DecoupledGCGDataset.py │ │ ├── GCGDataset.py │ │ ├── LlavaDataset.py │ │ ├── MDPVPointsDataset.py │ │ ├── ReferringSegDataset.py │ │ ├── RegionCaptionDataset.py │ │ ├── SemanticSegDataset.py │ │ ├── __init__.py │ │ ├── collect_fns │ │ │ ├── __init__.py │ │ │ └── omg_llava_collate_fn.py │ │ ├── process_functions │ │ │ ├── __init__.py │ │ │ ├── decoupled_gcg_process.py │ │ │ ├── gcg_process.py │ │ │ ├── mdpv_points_process.py │ │ │ ├── referring_seg_process.py │ │ │ ├── region_caption_process.py │ │ │ └── semantic_seg_process.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── ade20k_classes.json │ │ │ ├── cocostuff_classes.txt │ │ │ ├── grefer.py │ │ │ ├── refcoco_refer.py │ │ │ └── utils.py │ ├── engine │ │ ├── __init__.py │ │ ├── dataset_info_hook.py │ │ └── evaluate_chat_hook.py │ ├── model │ │ ├── __init__.py │ │ ├── convnext_clip │ │ │ ├── __init__.py │ │ │ ├── open_clip │ │ │ │ ├── __init__.py │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ ├── coca_model.py │ │ │ │ ├── constants.py │ │ │ │ ├── factory.py │ │ │ │ ├── generation_utils.py │ │ │ │ ├── hf_configs.py │ │ │ │ ├── hf_model.py │ │ │ │ ├── loss.py │ │ │ │ ├── model.py │ │ │ │ ├── model_configs │ │ │ │ │ ├── EVA01-g-14-plus.json │ │ │ │ │ ├── EVA01-g-14.json │ │ │ │ │ ├── EVA02-B-16.json │ │ │ │ │ ├── EVA02-E-14-plus.json │ │ │ │ │ ├── EVA02-E-14.json │ │ │ │ │ ├── EVA02-L-14-336.json │ │ │ │ │ ├── EVA02-L-14.json │ │ │ │ │ ├── RN101-quickgelu.json │ │ │ │ │ ├── RN101.json │ │ │ │ │ ├── RN50-quickgelu.json │ │ │ │ │ ├── RN50.json │ │ │ │ │ ├── RN50x16.json │ │ │ │ │ ├── RN50x4.json │ │ │ │ │ ├── RN50x64.json │ │ │ │ │ ├── ViT-B-16-plus-240.json │ │ │ │ │ ├── ViT-B-16-plus.json │ │ │ │ │ ├── ViT-B-16.json │ │ │ │ │ ├── ViT-B-32-plus-256.json │ │ │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ │ │ ├── ViT-B-32.json │ │ │ │ │ ├── ViT-H-14.json │ │ │ │ │ ├── ViT-H-16.json │ │ │ │ │ ├── ViT-L-14-280.json │ │ │ │ │ ├── ViT-L-14-336.json │ │ │ │ │ ├── ViT-L-14.json │ │ │ │ │ ├── ViT-L-16-320.json │ │ │ │ │ ├── ViT-L-16.json │ │ │ │ │ ├── ViT-M-16-alt.json │ │ │ │ │ ├── ViT-M-16.json │ │ │ │ │ ├── ViT-M-32-alt.json │ │ │ │ │ ├── ViT-M-32.json │ │ │ │ │ ├── ViT-S-16-alt.json │ │ │ │ │ ├── ViT-S-16.json │ │ │ │ │ ├── ViT-S-32-alt.json │ │ │ │ │ ├── ViT-S-32.json │ │ │ │ │ ├── ViT-bigG-14.json │ │ │ │ │ ├── ViT-e-14.json │ │ │ │ │ ├── ViT-g-14.json │ │ │ │ │ ├── coca_ViT-B-32.json │ │ │ │ │ ├── coca_ViT-L-14.json │ │ │ │ │ ├── coca_base.json │ │ │ │ │ ├── coca_roberta-ViT-B-32.json │ │ │ │ │ ├── convnext_base.json │ │ │ │ │ ├── convnext_base_w.json │ │ │ │ │ ├── convnext_base_w_320.json │ │ │ │ │ ├── convnext_large.json │ │ │ │ │ ├── convnext_large_d.json │ │ │ │ │ ├── convnext_large_d_320.json │ │ │ │ │ ├── convnext_small.json │ │ │ │ │ ├── convnext_tiny.json │ │ │ │ │ ├── convnext_xlarge.json │ │ │ │ │ ├── convnext_xxlarge.json │ │ │ │ │ ├── convnext_xxlarge_320.json │ │ │ │ │ ├── mt5-base-ViT-B-32.json │ │ │ │ │ ├── mt5-xl-ViT-H-14.json │ │ │ │ │ ├── roberta-ViT-B-32.json │ │ │ │ │ ├── swin_base_patch4_window7_224.json │ │ │ │ │ ├── vit_medium_patch16_gap_256.json │ │ │ │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ │ │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ │ │ │ └── xlm-roberta-large-ViT-H-14.json │ │ │ 
│ ├── modified_resnet.py │ │ │ │ ├── openai.py │ │ │ │ ├── pretrained.py │ │ │ │ ├── push_to_hf_hub.py │ │ │ │ ├── timm_model.py │ │ │ │ ├── tokenizer.py │ │ │ │ ├── transform.py │ │ │ │ ├── transformer.py │ │ │ │ ├── utils.py │ │ │ │ ├── version.py │ │ │ │ ├── zero_shot_classifier.py │ │ │ │ └── zero_shot_metadata.py │ │ │ └── openclip_backbone.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── projector │ │ │ │ ├── __init__.py │ │ │ │ ├── configuration_projector.py │ │ │ │ └── modeling_projector.py │ │ ├── omg_llava.py │ │ ├── omg_seg │ │ │ ├── __init__.py │ │ │ ├── mask2former_vid.py │ │ │ ├── mask2former_vid_semanticsam.py │ │ │ ├── omg_seg_visual_encoder.py │ │ │ └── utils.py │ │ └── utils.py │ └── tools │ │ ├── __init__.py │ │ ├── app.py │ │ ├── app_utils.py │ │ ├── chat_omg_llava.py │ │ ├── chat_omg_llava_msseg.py │ │ ├── convert_deepspeed2pth.py │ │ ├── evaluate_gcg.py │ │ ├── evaluate_region_cap.py │ │ ├── gcg_omg_seg_llava.py │ │ ├── mmbench_omg_seg_llava.py │ │ ├── refcoco_omg_seg_llava.py │ │ ├── region_cap_mask_omg_seg_llava.py │ │ └── utils_refcoco.py ├── requirements.txt ├── requirements │ ├── deepspeed.txt │ ├── docs.txt │ ├── modelscope.txt │ └── runtime.txt ├── setup.cfg ├── setup.py ├── test.jpg └── xtuner │ ├── __init__.py │ ├── apis │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── alpaca.py │ │ ├── arxiv.py │ │ ├── code_alpaca.py │ │ ├── colorist.py │ │ ├── lawyer.py │ │ ├── medical.py │ │ ├── moss_003_sft.py │ │ ├── oasst1.py │ │ ├── open_orca.py │ │ ├── sql.py │ │ ├── tiny_codes.py │ │ └── wizardlm.py │ ├── model.py │ └── training_args.py │ ├── configs │ ├── __init__.py │ ├── baichuan │ │ ├── baichuan2_13b_base │ │ │ ├── baichuan2_13b_base_qlora_alpaca_e3.py │ │ │ ├── baichuan2_13b_base_qlora_alpaca_enzh_e3.py │ │ │ ├── baichuan2_13b_base_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── baichuan2_13b_base_qlora_alpaca_zh_e3.py │ │ │ ├── baichuan2_13b_base_qlora_arxiv_gentitle_e3.py │ │ │ ├── baichuan2_13b_base_qlora_code_alpaca_e3.py │ │ │ ├── baichuan2_13b_base_qlora_colorist_e5.py │ │ │ ├── baichuan2_13b_base_qlora_lawyer_e3.py │ │ │ ├── baichuan2_13b_base_qlora_oasst1_512_e3.py │ │ │ ├── baichuan2_13b_base_qlora_oasst1_e3.py │ │ │ ├── baichuan2_13b_base_qlora_open_platypus_e3.py │ │ │ └── baichuan2_13b_base_qlora_sql_e3.py │ │ ├── baichuan2_13b_chat │ │ │ ├── baichuan2_13b_chat_qlora_alpaca_e3.py │ │ │ ├── baichuan2_13b_chat_qlora_alpaca_enzh_e3.py │ │ │ ├── baichuan2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── baichuan2_13b_chat_qlora_alpaca_zh_e3.py │ │ │ ├── baichuan2_13b_chat_qlora_code_alpaca_e3.py │ │ │ ├── baichuan2_13b_chat_qlora_lawyer_e3.py │ │ │ ├── baichuan2_13b_chat_qlora_oasst1_512_e3.py │ │ │ ├── baichuan2_13b_chat_qlora_oasst1_e3.py │ │ │ └── baichuan2_13b_chat_qlora_open_platypus_e3.py │ │ ├── baichuan2_7b_base │ │ │ ├── baichuan2_7b_base_qlora_alpaca_e3.py │ │ │ ├── baichuan2_7b_base_qlora_alpaca_enzh_e3.py │ │ │ ├── baichuan2_7b_base_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── baichuan2_7b_base_qlora_alpaca_zh_e3.py │ │ │ ├── baichuan2_7b_base_qlora_arxiv_gentitle_e3.py │ │ │ ├── baichuan2_7b_base_qlora_code_alpaca_e3.py │ │ │ ├── baichuan2_7b_base_qlora_colorist_e5.py │ │ │ ├── baichuan2_7b_base_qlora_lawyer_e3.py │ │ │ ├── baichuan2_7b_base_qlora_oasst1_512_e3.py │ │ │ ├── baichuan2_7b_base_qlora_oasst1_e3.py │ │ │ ├── baichuan2_7b_base_qlora_open_platypus_e3.py │ │ │ └── baichuan2_7b_base_qlora_sql_e3.py │ │ ├── baichuan2_7b_chat │ │ │ ├── baichuan2_7b_chat_qlora_alpaca_e3.py │ │ │ ├── baichuan2_7b_chat_qlora_alpaca_enzh_e3.py │ │ │ 
├── baichuan2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── baichuan2_7b_chat_qlora_alpaca_zh_e3.py │ │ │ ├── baichuan2_7b_chat_qlora_code_alpaca_e3.py │ │ │ ├── baichuan2_7b_chat_qlora_lawyer_e3.py │ │ │ ├── baichuan2_7b_chat_qlora_oasst1_512_e3.py │ │ │ ├── baichuan2_7b_chat_qlora_oasst1_e3.py │ │ │ └── baichuan2_7b_chat_qlora_open_platypus_e3.py │ │ ├── baichuan_13b_base │ │ │ ├── baichuan_13b_base_qlora_alpaca_e3.py │ │ │ ├── baichuan_13b_base_qlora_alpaca_enzh_e3.py │ │ │ ├── baichuan_13b_base_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── baichuan_13b_base_qlora_alpaca_zh_e3.py │ │ │ ├── baichuan_13b_base_qlora_arxiv_gentitle_e3.py │ │ │ ├── baichuan_13b_base_qlora_code_alpaca_e3.py │ │ │ ├── baichuan_13b_base_qlora_colorist_e5.py │ │ │ ├── baichuan_13b_base_qlora_lawyer_e3.py │ │ │ ├── baichuan_13b_base_qlora_medical_e1.py │ │ │ ├── baichuan_13b_base_qlora_moss_sft_all_e1.py │ │ │ ├── baichuan_13b_base_qlora_moss_sft_all_e2_gpu8.py │ │ │ ├── baichuan_13b_base_qlora_moss_sft_plugins_e1.py │ │ │ ├── baichuan_13b_base_qlora_oasst1_512_e3.py │ │ │ ├── baichuan_13b_base_qlora_oasst1_e3.py │ │ │ ├── baichuan_13b_base_qlora_open_platypus_e3.py │ │ │ ├── baichuan_13b_base_qlora_openorca_e1.py │ │ │ ├── baichuan_13b_base_qlora_sql_e3.py │ │ │ └── baichuan_13b_base_qlora_tiny_codes_e1.py │ │ ├── baichuan_13b_chat │ │ │ ├── baichuan_13b_chat_qlora_alpaca_e3.py │ │ │ ├── baichuan_13b_chat_qlora_alpaca_enzh_e3.py │ │ │ ├── baichuan_13b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── baichuan_13b_chat_qlora_alpaca_zh_e3.py │ │ │ ├── baichuan_13b_chat_qlora_arxiv_gentitle_e3.py │ │ │ ├── baichuan_13b_chat_qlora_code_alpaca_e3.py │ │ │ ├── baichuan_13b_chat_qlora_colorist_e5.py │ │ │ ├── baichuan_13b_chat_qlora_lawyer_e3.py │ │ │ ├── baichuan_13b_chat_qlora_medical_e1.py │ │ │ ├── baichuan_13b_chat_qlora_oasst1_512_e3.py │ │ │ ├── baichuan_13b_chat_qlora_oasst1_e3.py │ │ │ ├── baichuan_13b_chat_qlora_open_platypus_e3.py │ │ │ ├── baichuan_13b_chat_qlora_openorca_e1.py │ │ │ ├── baichuan_13b_chat_qlora_sql_e3.py │ │ │ └── baichuan_13b_chat_qlora_tiny_codes_e1.py │ │ └── baichuan_7b │ │ │ ├── baichuan_7b_qlora_alpaca_e3.py │ │ │ ├── baichuan_7b_qlora_alpaca_enzh_e3.py │ │ │ ├── baichuan_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── baichuan_7b_qlora_alpaca_zh_e3.py │ │ │ ├── baichuan_7b_qlora_arxiv_gentitle_e3.py │ │ │ ├── baichuan_7b_qlora_code_alpaca_e3.py │ │ │ ├── baichuan_7b_qlora_colorist_e5.py │ │ │ ├── baichuan_7b_qlora_lawyer_e3.py │ │ │ ├── baichuan_7b_qlora_medical_e1.py │ │ │ ├── baichuan_7b_qlora_moss_sft_all_e1.py │ │ │ ├── baichuan_7b_qlora_moss_sft_all_e2_gpu8.py │ │ │ ├── baichuan_7b_qlora_moss_sft_plugins_e1.py │ │ │ ├── baichuan_7b_qlora_oasst1_512_e3.py │ │ │ ├── baichuan_7b_qlora_oasst1_e3.py │ │ │ ├── baichuan_7b_qlora_open_platypus_e3.py │ │ │ ├── baichuan_7b_qlora_openorca_e1.py │ │ │ ├── baichuan_7b_qlora_sql_e3.py │ │ │ └── baichuan_7b_qlora_tiny_codes_e1.py │ ├── chatglm │ │ ├── chatglm2_6b │ │ │ ├── chatglm2_6b_qlora_alpaca_e3.py │ │ │ ├── chatglm2_6b_qlora_alpaca_enzh_e3.py │ │ │ ├── chatglm2_6b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── chatglm2_6b_qlora_alpaca_zh_e3.py │ │ │ ├── chatglm2_6b_qlora_arxiv_gentitle_e3.py │ │ │ ├── chatglm2_6b_qlora_code_alpaca_e3.py │ │ │ ├── chatglm2_6b_qlora_colorist_e5.py │ │ │ ├── chatglm2_6b_qlora_lawyer_e3.py │ │ │ ├── chatglm2_6b_qlora_medical_e1.py │ │ │ ├── chatglm2_6b_qlora_oasst1_512_e3.py │ │ │ ├── chatglm2_6b_qlora_oasst1_e3.py │ │ │ ├── chatglm2_6b_qlora_open_platypus_e3.py │ │ │ ├── chatglm2_6b_qlora_openorca_e1.py │ │ │ ├── 
chatglm2_6b_qlora_sql_e3.py │ │ │ └── chatglm2_6b_qlora_tiny_codes_e1.py │ │ ├── chatglm3_6b │ │ │ ├── chatglm3_6b_qlora_alpaca_e3.py │ │ │ ├── chatglm3_6b_qlora_alpaca_enzh_e3.py │ │ │ ├── chatglm3_6b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── chatglm3_6b_qlora_alpaca_zh_e3.py │ │ │ ├── chatglm3_6b_qlora_arxiv_gentitle_e3.py │ │ │ ├── chatglm3_6b_qlora_code_alpaca_e3.py │ │ │ ├── chatglm3_6b_qlora_colorist_e5.py │ │ │ ├── chatglm3_6b_qlora_lawyer_e3.py │ │ │ ├── chatglm3_6b_qlora_medical_e1.py │ │ │ ├── chatglm3_6b_qlora_oasst1_512_e3.py │ │ │ ├── chatglm3_6b_qlora_oasst1_e3.py │ │ │ ├── chatglm3_6b_qlora_open_platypus_e3.py │ │ │ ├── chatglm3_6b_qlora_openorca_e1.py │ │ │ ├── chatglm3_6b_qlora_sql_e3.py │ │ │ └── chatglm3_6b_qlora_tiny_codes_e1.py │ │ └── chatglm3_6b_base │ │ │ ├── chatglm3_6b_base_qlora_alpaca_e3.py │ │ │ ├── chatglm3_6b_base_qlora_alpaca_enzh_e3.py │ │ │ ├── chatglm3_6b_base_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── chatglm3_6b_base_qlora_alpaca_zh_e3.py │ │ │ ├── chatglm3_6b_base_qlora_arxiv_gentitle_e3.py │ │ │ ├── chatglm3_6b_base_qlora_code_alpaca_e3.py │ │ │ ├── chatglm3_6b_base_qlora_colorist_e5.py │ │ │ ├── chatglm3_6b_base_qlora_lawyer_e3.py │ │ │ ├── chatglm3_6b_base_qlora_medical_e1.py │ │ │ ├── chatglm3_6b_base_qlora_oasst1_512_e3.py │ │ │ ├── chatglm3_6b_base_qlora_oasst1_e3.py │ │ │ ├── chatglm3_6b_base_qlora_open_platypus_e3.py │ │ │ ├── chatglm3_6b_base_qlora_openorca_e1.py │ │ │ ├── chatglm3_6b_base_qlora_sql_e3.py │ │ │ └── chatglm3_6b_base_qlora_tiny_codes_e1.py │ ├── cohere │ │ ├── README.md │ │ └── cohere_104b │ │ │ └── cohere_100b_128k_sp32.py │ ├── custom_dataset │ │ ├── pretrain │ │ │ ├── baichuan │ │ │ │ ├── baichuan2_13b_base_full_custom_pretrain_e1.py │ │ │ │ └── baichuan2_7b_base_full_custom_pretrain_e1.py │ │ │ ├── chatglm │ │ │ │ ├── chatglm2_6b_full_custom_pretrain_e1.py │ │ │ │ └── chatglm3_6b_full_custom_pretrain_e1.py │ │ │ ├── deepseek │ │ │ │ └── deepseek_moe_16b_base_full_custom_pretrain_e1.py │ │ │ ├── gemma │ │ │ │ ├── gemma_2b_full_custom_pretrain_e1.py │ │ │ │ └── gemma_7b_full_custom_pretrain_e1.py │ │ │ ├── internlm │ │ │ │ ├── internlm2_1_8b_full_custom_pretrain_e1.py │ │ │ │ ├── internlm2_20b_full_custom_pretrain_e1.py │ │ │ │ └── internlm2_7b_full_custom_pretrain_e1.py │ │ │ ├── llama │ │ │ │ ├── llama2_70b_full_custom_pretrain_e1.py │ │ │ │ └── llama2_7b_full_custom_pretrain_e1.py │ │ │ ├── mistral │ │ │ │ └── mistral_7b_full_custom_pretrain_e1.py │ │ │ ├── mixtral │ │ │ │ └── mixtral_8x7b_full_custom_pretrain_e1.py │ │ │ ├── qwen │ │ │ │ ├── qwen1_5_0_5b_full_custom_pretrain_e1.py │ │ │ │ ├── qwen1_5_14b_full_custom_pretrain_e1.py │ │ │ │ ├── qwen1_5_1_8b_full_custom_pretrain_e1.py │ │ │ │ ├── qwen1_5_4b_full_custom_pretrain_e1.py │ │ │ │ ├── qwen1_5_72b_full_custom_pretrain_e1.py │ │ │ │ ├── qwen1_5_7b_full_custom_pretrain_e1.py │ │ │ │ ├── qwen_1_8b_full_custom_pretrain_e1.py │ │ │ │ ├── qwen_72b_full_custom_pretrain_e1.py │ │ │ │ └── qwen_7b_full_custom_pretrain_e1.py │ │ │ ├── starcoder │ │ │ │ └── starcoder_full_custom_pretrain_e1.py │ │ │ ├── yi │ │ │ │ ├── yi_34b_full_custom_pretrain_e1.py │ │ │ │ └── yi_6b_full_custom_pretrain_e1.py │ │ │ └── zephyr │ │ │ │ └── zephyr_7b_beta_full_custom_pretrain_e1.py │ │ └── sft │ │ │ ├── baichuan │ │ │ ├── baichuan2_13b_chat_qlora_custom_sft_e1.py │ │ │ ├── baichuan2_7b_chat_qlora_custom_sft_e1.py │ │ │ ├── baichuan_13b_chat_qlora_custom_sft_e1.py │ │ │ └── baichuan_7b_qlora_custom_sft_e1.py │ │ │ ├── chatglm │ │ │ ├── chatglm2_6b_qlora_custom_sft_e1.py │ │ │ └── 
chatglm3_6b_qlora_custom_sft_e1.py │ │ │ ├── deepseek │ │ │ ├── deepseek_moe_16b_chat_qlora_custom_sft_e1.py │ │ │ └── deepseekcoder_6_7b_instruct_qlora_custom_sft_e1.py │ │ │ ├── gemma │ │ │ ├── gemma_2b_it_qlora_custom_sft_e1.py │ │ │ ├── gemma_2b_qlora_custom_sft_e1.py │ │ │ ├── gemma_7b_it_qlora_custom_sft_e1.py │ │ │ └── gemma_7b_qlora_custom_sft_e1.py │ │ │ ├── internlm │ │ │ ├── internlm2_chat_1_8b_qlora_custom_sft_e1.py │ │ │ ├── internlm2_chat_20b_qlora_custom_sft_e1.py │ │ │ └── internlm2_chat_7b_qlora_custom_sft_e1.py │ │ │ ├── llama │ │ │ ├── llama2_70b_qlora_custom_sft_e1.py │ │ │ └── llama2_7b_chat_qlora_custom_sft_e1.py │ │ │ ├── mistral │ │ │ └── mistral_7b_full_finetune_custom_sft_e1.py │ │ │ ├── mixtral │ │ │ └── mixtral_8x7b_instruct_qlora_custom_sft_e1.py │ │ │ ├── qwen │ │ │ ├── qwen1_5_0_5b_chat_qlora_custom_sft_e1.py │ │ │ ├── qwen1_5_14b_chat_qlora_custom_sft_e1.py │ │ │ ├── qwen1_5_1_8b_chat_qlora_custom_sft_e1.py │ │ │ ├── qwen1_5_4b_chat_qlora_custom_sft_e1.py │ │ │ ├── qwen1_5_72b_chat_qlora_custom_sft_e1.py │ │ │ ├── qwen1_5_7b_chat_qlora_custom_sft_e1.py │ │ │ ├── qwen_1_8b_chat_qlora_custom_sft_e1.py │ │ │ ├── qwen_72b_qlora_custom_sft_e1.py │ │ │ └── qwen_7b_chat_qlora_custom_sft_e1.py │ │ │ ├── starcoder │ │ │ └── starcoder_qlora_custom_sft_e1.py │ │ │ ├── yi │ │ │ ├── yi_34b_qlora_custom_sft_e1.py │ │ │ └── yi_6b_qlora_custom_sft_e1.py │ │ │ └── zephyr │ │ │ └── zephyr_7b_beta_qlora_custom_sft_e1.py │ ├── deepseek │ │ ├── README.md │ │ ├── deepseek_coder_6_7b_base │ │ │ └── deepseek_coder_6_7b_base_qlora_code_alpaca_e3.py │ │ ├── deepseek_coder_6_7b_instruct │ │ │ └── deepseekcoder_6_7b_instruct_qlora_code_alpaca_e3.py │ │ ├── deepseek_moe_16b_base │ │ │ ├── deepseek_moe_16b_base_full_oasst1_e3.py │ │ │ └── deepseek_moe_16b_base_qlora_oasst1_e3.py │ │ ├── deepseek_moe_16b_chat │ │ │ ├── deepseek_moe_16b_chat_full_oasst1_e3.py │ │ │ └── deepseek_moe_16b_chat_qlora_oasst1_e3.py │ │ ├── deepseek_v2_chat │ │ │ └── deepseek_v2_chat_full_alpaca_e3.py │ │ └── deepseek_v2_lite_chat │ │ │ ├── deepseek_v2_lite_chat_full_alpaca_e3.py │ │ │ └── deepseek_v2_lite_chat_full_alpaca_e3_32k_varlen.py │ ├── deepspeed │ │ ├── deepspeed_zero1.json │ │ ├── deepspeed_zero2.json │ │ ├── deepspeed_zero2_offload.json │ │ ├── deepspeed_zero3.json │ │ └── deepspeed_zero3_offload.json │ ├── dpo │ │ ├── internlm │ │ │ ├── internlm2_chat_1_8b_dpo_full.py │ │ │ ├── internlm2_chat_1_8b_dpo_full_varlenattn.py │ │ │ ├── internlm2_chat_1_8b_dpo_full_varlenattn_jsonl_dataset.py │ │ │ └── internlm2_chat_7b_dpo_qlora_varlenattn.py │ │ └── llama │ │ │ └── llama3_8b_instruct_dpo_qlora_varlenattn.py │ ├── gemma │ │ ├── gemma_2b │ │ │ ├── gemma_2b_full_alpaca_e3.py │ │ │ └── gemma_2b_qlora_alpaca_e3.py │ │ ├── gemma_2b_it │ │ │ ├── gemma_2b_it_full_alpaca_e3.py │ │ │ └── gemma_2b_it_qlora_alpaca_e3.py │ │ ├── gemma_7b │ │ │ ├── gemma_7b_full_alpaca_e3.py │ │ │ └── gemma_7b_qlora_alpaca_e3.py │ │ └── gemma_7b_it │ │ │ ├── gemma_7b_it_full_alpaca_e3.py │ │ │ └── gemma_7b_it_qlora_alpaca_e3.py │ ├── internlm │ │ ├── internlm2_1_8b │ │ │ ├── internlm2_1_8b_full_alpaca_e3.py │ │ │ └── internlm2_1_8b_qlora_alpaca_e3.py │ │ ├── internlm2_20b │ │ │ ├── internlm2_20b_full_finetune_custom_dataset_e1.py │ │ │ ├── internlm2_20b_qlora_alpaca_e3.py │ │ │ ├── internlm2_20b_qlora_arxiv_gentitle_e3.py │ │ │ ├── internlm2_20b_qlora_code_alpaca_e3.py │ │ │ ├── internlm2_20b_qlora_colorist_e5.py │ │ │ ├── internlm2_20b_qlora_lawyer_e3.py │ │ │ ├── internlm2_20b_qlora_msagent_react_e3_gpu8.py │ │ │ ├── 
internlm2_20b_qlora_oasst1_512_e3.py │ │ │ ├── internlm2_20b_qlora_oasst1_e3.py │ │ │ └── internlm2_20b_qlora_sql_e3.py │ │ ├── internlm2_7b │ │ │ ├── internlm2_7b_full_finetune_custom_dataset_e1.py │ │ │ ├── internlm2_7b_full_finetune_custom_dataset_e1_sequence_parallel_4.py │ │ │ ├── internlm2_7b_qlora_alpaca_e3.py │ │ │ ├── internlm2_7b_qlora_arxiv_gentitle_e3.py │ │ │ ├── internlm2_7b_qlora_code_alpaca_e3.py │ │ │ ├── internlm2_7b_qlora_colorist_e5.py │ │ │ ├── internlm2_7b_qlora_json_e3.py │ │ │ ├── internlm2_7b_qlora_lawyer_e3.py │ │ │ ├── internlm2_7b_qlora_msagent_react_e3_gpu8.py │ │ │ ├── internlm2_7b_qlora_oasst1_512_e3.py │ │ │ ├── internlm2_7b_qlora_oasst1_e3.py │ │ │ ├── internlm2_7b_qlora_sql_e3.py │ │ │ ├── internlm2_7b_w_internevo_dataset.py │ │ │ ├── internlm2_7b_w_tokenized_dataset.py │ │ │ └── internlm2_7b_w_untokenized_dataset.py │ │ ├── internlm2_chat_1_8b │ │ │ ├── internlm2_chat_1_8b_full_alpaca_e3.py │ │ │ └── internlm2_chat_1_8b_qlora_alpaca_e3.py │ │ ├── internlm2_chat_20b │ │ │ ├── internlm2_chat_20b_full_finetune_custom_dataset_e1.py │ │ │ ├── internlm2_chat_20b_qlora_alpaca_e3.py │ │ │ ├── internlm2_chat_20b_qlora_code_alpaca_e3.py │ │ │ ├── internlm2_chat_20b_qlora_lawyer_e3.py │ │ │ ├── internlm2_chat_20b_qlora_oasst1_512_e3.py │ │ │ └── internlm2_chat_20b_qlora_oasst1_e3.py │ │ ├── internlm2_chat_7b │ │ │ ├── internlm2_chat_7b_full_finetune_custom_dataset_e1.py │ │ │ ├── internlm2_chat_7b_qlora_alpaca_e3.py │ │ │ ├── internlm2_chat_7b_qlora_code_alpaca_e3.py │ │ │ ├── internlm2_chat_7b_qlora_lawyer_e3.py │ │ │ ├── internlm2_chat_7b_qlora_oasst1_512_e3.py │ │ │ └── internlm2_chat_7b_qlora_oasst1_e3.py │ │ ├── internlm_20b │ │ │ ├── internlm_20b_qlora_alpaca_e3.py │ │ │ ├── internlm_20b_qlora_alpaca_enzh_e3.py │ │ │ ├── internlm_20b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── internlm_20b_qlora_alpaca_zh_e3.py │ │ │ ├── internlm_20b_qlora_arxiv_gentitle_e3.py │ │ │ ├── internlm_20b_qlora_code_alpaca_e3.py │ │ │ ├── internlm_20b_qlora_colorist_e5.py │ │ │ ├── internlm_20b_qlora_lawyer_e3.py │ │ │ ├── internlm_20b_qlora_msagent_react_e3_gpu8.py │ │ │ ├── internlm_20b_qlora_oasst1_512_e3.py │ │ │ ├── internlm_20b_qlora_oasst1_e3.py │ │ │ ├── internlm_20b_qlora_open_platypus_e3.py │ │ │ └── internlm_20b_qlora_sql_e3.py │ │ ├── internlm_7b │ │ │ ├── internlm_7b_full_alpaca_e3.py │ │ │ ├── internlm_7b_full_alpaca_enzh_e3.py │ │ │ ├── internlm_7b_full_alpaca_enzh_oasst1_e3.py │ │ │ ├── internlm_7b_full_alpaca_zh_e3.py │ │ │ ├── internlm_7b_full_intern_repo_dataset_template.py │ │ │ ├── internlm_7b_full_oasst1_e3.py │ │ │ ├── internlm_7b_qlora_alpaca_e3.py │ │ │ ├── internlm_7b_qlora_alpaca_enzh_e3.py │ │ │ ├── internlm_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── internlm_7b_qlora_alpaca_zh_e3.py │ │ │ ├── internlm_7b_qlora_arxiv_gentitle_e3.py │ │ │ ├── internlm_7b_qlora_code_alpaca_e3.py │ │ │ ├── internlm_7b_qlora_colorist_e5.py │ │ │ ├── internlm_7b_qlora_json_e3.py │ │ │ ├── internlm_7b_qlora_lawyer_e3.py │ │ │ ├── internlm_7b_qlora_medical_e1.py │ │ │ ├── internlm_7b_qlora_moss_sft_all_e1.py │ │ │ ├── internlm_7b_qlora_moss_sft_all_e2_gpu8.py │ │ │ ├── internlm_7b_qlora_moss_sft_plugins_e1.py │ │ │ ├── internlm_7b_qlora_msagent_react_e3_gpu8.py │ │ │ ├── internlm_7b_qlora_oasst1_512_e3.py │ │ │ ├── internlm_7b_qlora_oasst1_e3.py │ │ │ ├── internlm_7b_qlora_oasst1_e3_hf.py │ │ │ ├── internlm_7b_qlora_oasst1_mmlu_e3.py │ │ │ ├── internlm_7b_qlora_open_platypus_e3.py │ │ │ ├── internlm_7b_qlora_openorca_e1.py │ │ │ ├── internlm_7b_qlora_sql_e3.py │ │ │ └── 
internlm_7b_qlora_tiny_codes_e1.py │ │ ├── internlm_chat_20b │ │ │ ├── internlm_chat_20b_qlora_alpaca_e3.py │ │ │ ├── internlm_chat_20b_qlora_alpaca_enzh_e3.py │ │ │ ├── internlm_chat_20b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── internlm_chat_20b_qlora_alpaca_zh_e3.py │ │ │ ├── internlm_chat_20b_qlora_code_alpaca_e3.py │ │ │ ├── internlm_chat_20b_qlora_lawyer_e3.py │ │ │ ├── internlm_chat_20b_qlora_oasst1_512_e3.py │ │ │ ├── internlm_chat_20b_qlora_oasst1_e3.py │ │ │ └── internlm_chat_20b_qlora_open_platypus_e3.py │ │ └── internlm_chat_7b │ │ │ ├── internlm_chat_7b_qlora_alpaca_e3.py │ │ │ ├── internlm_chat_7b_qlora_alpaca_enzh_e3.py │ │ │ ├── internlm_chat_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── internlm_chat_7b_qlora_alpaca_zh_e3.py │ │ │ ├── internlm_chat_7b_qlora_arxiv_gentitle_e3.py │ │ │ ├── internlm_chat_7b_qlora_code_alpaca_e3.py │ │ │ ├── internlm_chat_7b_qlora_colorist_e5.py │ │ │ ├── internlm_chat_7b_qlora_lawyer_e3.py │ │ │ ├── internlm_chat_7b_qlora_medical_e1.py │ │ │ ├── internlm_chat_7b_qlora_oasst1_512_e3.py │ │ │ ├── internlm_chat_7b_qlora_oasst1_e3.py │ │ │ ├── internlm_chat_7b_qlora_open_platypus_e3.py │ │ │ ├── internlm_chat_7b_qlora_openorca_e1.py │ │ │ ├── internlm_chat_7b_qlora_sql_e3.py │ │ │ └── internlm_chat_7b_qlora_tiny_codes_e1.py │ ├── llama │ │ ├── llama2_70b │ │ │ ├── llama2_70b_full_wizardlm_e1.py │ │ │ ├── llama2_70b_int8_lora_open_platypus_e1.py │ │ │ ├── llama2_70b_int8_lora_open_platypus_e1_hf.py │ │ │ ├── llama2_70b_qlora_open_platypus_e1.py │ │ │ └── llama2_70b_qlora_open_platypus_e1_hf.py │ │ ├── llama2_7b │ │ │ ├── llama2_7b_full_pgbooks_400iters_sp1.py │ │ │ ├── llama2_7b_full_pgbooks_400iters_sp4.py │ │ │ ├── llama2_7b_full_wizardlm_e1.py │ │ │ ├── llama2_7b_qlora_alpaca_e3.py │ │ │ ├── llama2_7b_qlora_alpaca_enzh_e3.py │ │ │ ├── llama2_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── llama2_7b_qlora_alpaca_zh_e3.py │ │ │ ├── llama2_7b_qlora_arxiv_gentitle_e3.py │ │ │ ├── llama2_7b_qlora_code_alpaca_e3.py │ │ │ ├── llama2_7b_qlora_colorist_e5.py │ │ │ ├── llama2_7b_qlora_lawyer_e3.py │ │ │ ├── llama2_7b_qlora_medical_e1.py │ │ │ ├── llama2_7b_qlora_moss_sft_all_e1.py │ │ │ ├── llama2_7b_qlora_moss_sft_all_e2_gpu8.py │ │ │ ├── llama2_7b_qlora_moss_sft_plugins_e1.py │ │ │ ├── llama2_7b_qlora_msagent_react_e3_gpu8.py │ │ │ ├── llama2_7b_qlora_oasst1_512_e3.py │ │ │ ├── llama2_7b_qlora_oasst1_e3.py │ │ │ ├── llama2_7b_qlora_open_platypus_e3.py │ │ │ ├── llama2_7b_qlora_openorca_e1.py │ │ │ ├── llama2_7b_qlora_sql_e3.py │ │ │ └── llama2_7b_qlora_tiny_codes_e1.py │ │ ├── llama2_7b_chat │ │ │ ├── llama2_7b_chat_qlora_alpaca_e3.py │ │ │ ├── llama2_7b_chat_qlora_alpaca_enzh_e3.py │ │ │ ├── llama2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── llama2_7b_chat_qlora_alpaca_zh_e3.py │ │ │ ├── llama2_7b_chat_qlora_arxiv_gentitle_e3.py │ │ │ ├── llama2_7b_chat_qlora_code_alpaca_e3.py │ │ │ ├── llama2_7b_chat_qlora_colorist_e5.py │ │ │ ├── llama2_7b_chat_qlora_lawyer_e3.py │ │ │ ├── llama2_7b_chat_qlora_medical_e1.py │ │ │ ├── llama2_7b_chat_qlora_oasst1_512_e3.py │ │ │ ├── llama2_7b_chat_qlora_oasst1_e3.py │ │ │ ├── llama2_7b_chat_qlora_open_platypus_e3.py │ │ │ ├── llama2_7b_chat_qlora_openorca_e1.py │ │ │ ├── llama2_7b_chat_qlora_sql_e3.py │ │ │ └── llama2_7b_chat_qlora_tiny_codes_e1.py │ │ ├── llama3_70b_instruct │ │ │ └── llama3_70b_instruct_qlora_alpaca_e3_2k_gpu8.py │ │ ├── llama3_8b │ │ │ ├── README.md │ │ │ └── llama3_8b_full_alpaca_e3.py │ │ ├── llama3_8b_instruct │ │ │ ├── llama3_8b_instruct_full_alpaca_e3.py │ │ │ └── 
llama3_8b_instruct_qlora_alpaca_e3.py │ │ └── llama_7b │ │ │ ├── llama_7b_qlora_alpaca_e3.py │ │ │ ├── llama_7b_qlora_alpaca_enzh_e3.py │ │ │ ├── llama_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ ├── llama_7b_qlora_alpaca_zh_e3.py │ │ │ ├── llama_7b_qlora_arxiv_gentitle_e3.py │ │ │ ├── llama_7b_qlora_code_alpaca_e3.py │ │ │ ├── llama_7b_qlora_colorist_e5.py │ │ │ ├── llama_7b_qlora_lawyer_e3.py │ │ │ ├── llama_7b_qlora_medical_e1.py │ │ │ ├── llama_7b_qlora_moss_sft_all_e1.py │ │ │ ├── llama_7b_qlora_moss_sft_all_e2_gpu8.py │ │ │ ├── llama_7b_qlora_moss_sft_plugins_e1.py │ │ │ ├── llama_7b_qlora_oasst1_512_e3.py │ │ │ ├── llama_7b_qlora_oasst1_e3.py │ │ │ ├── llama_7b_qlora_open_platypus_e3.py │ │ │ ├── llama_7b_qlora_openorca_e1.py │ │ │ ├── llama_7b_qlora_sql_e3.py │ │ │ └── llama_7b_qlora_tiny_codes_e1.py │ ├── llama_speed_benchmark │ │ ├── llama2_70b │ │ │ ├── llama2_70b_full_alpaca_enzh_128k_sp8.py │ │ │ ├── llama2_70b_full_alpaca_enzh_256k_sp16.py │ │ │ ├── llama2_70b_full_alpaca_enzh_32k_sp4.py │ │ │ └── llama2_70b_full_alpaca_enzh_8k_sp1.py │ │ ├── llama2_7b │ │ │ ├── llama2_7b_full_alpaca_enzh_128k_sp8.py │ │ │ ├── llama2_7b_full_alpaca_enzh_1M_sp16.py │ │ │ ├── llama2_7b_full_alpaca_enzh_256k_sp8.py │ │ │ ├── llama2_7b_full_alpaca_enzh_32k_sp1.py │ │ │ └── llama2_7b_full_alpaca_enzh_8k_sp1.py │ │ └── yi_34b │ │ │ ├── yi_34b_200k_full_alpaca_enzh_128k_sp8.py │ │ │ ├── yi_34b_200k_full_alpaca_enzh_256k_sp8.py │ │ │ ├── yi_34b_200k_full_alpaca_enzh_32k_sp2.py │ │ │ └── yi_34b_200k_full_alpaca_enzh_8k_sp1.py │ ├── llava │ │ ├── README.md │ │ ├── README_zh-CN.md │ │ ├── internlm2_chat_1_8b_clip_vit_large_p14_336 │ │ │ ├── finetune │ │ │ │ └── llava_internlm2_chat_1_8b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ │ └── pretrain │ │ │ │ └── llava_internlm2_chat_1_8b_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ ├── internlm2_chat_20b_clip_vit_large_p14_336 │ │ │ ├── finetune │ │ │ │ ├── llava_internlm2_chat_20b_clip_vit_large_p14_336_e1_gpu8_finetune.py │ │ │ │ └── llava_internlm2_chat_20b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ │ └── pretrain │ │ │ │ └── llava_internlm2_chat_20b_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ ├── internlm2_chat_7b_clip_vit_large_p14_336 │ │ │ ├── finetune │ │ │ │ ├── llava_internlm2_chat_7b_clip_vit_large_p14_336_e1_gpu8_finetune.py │ │ │ │ └── llava_internlm2_chat_7b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ │ └── pretrain │ │ │ │ └── llava_internlm2_chat_7b_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ ├── internlm_chat_7b_clip_vit_large_p14_336 │ │ │ ├── finetune │ │ │ │ └── llava_internlm_chat_7b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ │ └── pretrain │ │ │ │ └── llava_internlm_chat_7b_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ ├── llama3_70b_instruct_clip_vit_large_p14_336 │ │ │ └── pretrain │ │ │ │ └── llava_llama3_70b_instruct_quant_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ ├── llama3_8b_instruct_clip_vit_large_p14_336 │ │ │ ├── README.md │ │ │ ├── convert_xtuner_weights_to_hf.py │ │ │ ├── convert_xtuner_weights_to_llava.py │ │ │ ├── finetune │ │ │ │ ├── llava_llama3_8b_instruct_full_clip_vit_large_p14_336_e1_gpu8_finetune.py │ │ │ │ ├── llava_llama3_8b_instruct_full_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ │ │ ├── llava_llama3_8b_instruct_full_clip_vit_large_p14_336_lora_e1_gpu8_internvl_finetune.py │ │ │ │ └── llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_e1_gpu1_finetune.py │ │ │ └── pretrain │ │ │ │ ├── 
llava_llama3_8b_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ │ │ ├── llava_llama3_8b_instruct_clip_vit_large_p14_336_e1_gpu8_sharegpt4v_pretrain.py │ │ │ │ └── llava_llama3_8b_instruct_quant_clip_vit_large_p14_336_e1_gpu1_pretrain.py │ │ ├── official │ │ │ ├── llava_v15_13b │ │ │ │ ├── llava_v15_13b_finetune.py │ │ │ │ ├── llava_v15_13b_finetune_lora.py │ │ │ │ └── llava_v15_13b_pretrain.py │ │ │ └── llava_v15_7b │ │ │ │ ├── llava_v15_7b_finetune.py │ │ │ │ ├── llava_v15_7b_finetune_lora.py │ │ │ │ └── llava_v15_7b_pretrain.py │ │ ├── phi3_mini_4k_instruct_clip_vit_large_p14_336 │ │ │ ├── README.md │ │ │ ├── convert_phi_to_llama.py │ │ │ ├── convert_xtuner_weights_to_hf.py │ │ │ ├── convert_xtuner_weights_to_llava.py │ │ │ ├── finetune │ │ │ │ ├── llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_e1_gpu8_finetune.py │ │ │ │ └── llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_full_e2_gpu8_internvl_finetune.py │ │ │ └── pretrain │ │ │ │ ├── llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ │ │ └── llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_sharegpt4v_pretrain.py │ │ ├── vicuna_13b_v15_clip_vit_large_p14_336 │ │ │ ├── finetune │ │ │ │ └── llava_vicuna_13b_v15_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ │ └── pretrain │ │ │ │ └── llava_vicuna_13b_v15_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ │ └── vicuna_7b_v15_clip_vit_large_p14_336 │ │ │ ├── finetune │ │ │ ├── llava_vicuna_7b_v15_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py │ │ │ └── llava_vicuna_7b_v15_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune_refcoco.py │ │ │ └── pretrain │ │ │ └── llava_vicuna_7b_v15_clip_vit_large_p14_336_e1_gpu8_pretrain.py │ ├── mistral │ │ ├── mistral_7b_full_finetune_custom_dataset_e1.py │ │ ├── mistral_7b_qlora_skypile_pretrain_e1.py │ │ ├── mistral_7b_w_tokenized_dataset.py │ │ └── mistral_7b_w_untokenized_dataset.py │ ├── mixtral │ │ ├── README.md │ │ ├── mixtral_8x7b │ │ │ ├── mixtral_8x7b_full_oasst1_e3.py │ │ │ └── mixtral_8x7b_qlora_oasst1_e3.py │ │ └── mixtral_8x7b_instruct │ │ │ ├── mixtral_8x7b_instruct_full_oasst1_e3.py │ │ │ └── mixtral_8x7b_instruct_qlora_oasst1_e3.py │ ├── orpo │ │ ├── internlm │ │ │ ├── internlm2_chat_1_8b_orpo_full.py │ │ │ ├── internlm2_chat_1_8b_orpo_full_varlenattn.py │ │ │ ├── internlm2_chat_1_8b_orpo_full_varlenattn_jsonl_dataset.py │ │ │ └── internlm2_chat_7b_orpo_qlora_varlenattn_ultrafeedback_e5.py │ │ └── llama │ │ │ └── llama3_8b_instruct_orpo_qlora_varlenattn_ultrafeedback_e5.py │ ├── phi │ │ └── phi3 │ │ │ ├── phi3_mini_128k_instruct_full_alpaca_e3.py │ │ │ ├── phi3_mini_128k_instruct_qlora_alpaca_e3.py │ │ │ ├── phi3_mini_4k_instruct_full_alpaca_e3.py │ │ │ └── phi3_mini_4k_instruct_qlora_alpaca_e3.py │ ├── qwen │ │ ├── qwen1 │ │ │ ├── qwen_1_8b │ │ │ │ ├── qwen_1_8b_qlora_alpaca_e3.py │ │ │ │ ├── qwen_1_8b_qlora_alpaca_enzh_e3.py │ │ │ │ ├── qwen_1_8b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ │ ├── qwen_1_8b_qlora_alpaca_zh_e3.py │ │ │ │ └── qwen_1_8b_qlora_code_alpaca_e3.py │ │ │ ├── qwen_1_8b_chat │ │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_e3.py │ │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_enzh_e3.py │ │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_zh_e3.py │ │ │ │ └── qwen_1_8b_chat_qlora_code_alpaca_e3.py │ │ │ ├── qwen_72b │ │ │ │ ├── qwen_72b_qlora_alpaca_e3.py │ │ │ │ ├── qwen_72b_qlora_alpaca_enzh_e3.py │ │ │ │ ├── qwen_72b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ │ ├── qwen_72b_qlora_alpaca_zh_e3.py │ │ │ │ └── 
qwen_72b_qlora_code_alpaca_e3.py │ │ │ ├── qwen_7b │ │ │ │ ├── qwen_7b_qlora_alpaca_e3.py │ │ │ │ ├── qwen_7b_qlora_alpaca_enzh_e3.py │ │ │ │ ├── qwen_7b_qlora_alpaca_enzh_oasst1_e3.py │ │ │ │ ├── qwen_7b_qlora_alpaca_zh_e3.py │ │ │ │ ├── qwen_7b_qlora_arxiv_gentitle_e3.py │ │ │ │ ├── qwen_7b_qlora_code_alpaca_e3.py │ │ │ │ ├── qwen_7b_qlora_colorist_e5.py │ │ │ │ ├── qwen_7b_qlora_lawyer_e3.py │ │ │ │ ├── qwen_7b_qlora_medical_e1.py │ │ │ │ ├── qwen_7b_qlora_moss_sft_all_e1.py │ │ │ │ ├── qwen_7b_qlora_moss_sft_all_e2_gpu8.py │ │ │ │ ├── qwen_7b_qlora_moss_sft_plugins_e1.py │ │ │ │ ├── qwen_7b_qlora_oasst1_512_e3.py │ │ │ │ ├── qwen_7b_qlora_oasst1_e3.py │ │ │ │ ├── qwen_7b_qlora_open_platypus_e3.py │ │ │ │ ├── qwen_7b_qlora_openorca_e1.py │ │ │ │ ├── qwen_7b_qlora_sql_e3.py │ │ │ │ └── qwen_7b_qlora_tiny_codes_e1.py │ │ │ └── qwen_7b_chat │ │ │ │ ├── qwen_7b_chat_qlora_alpaca_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_alpaca_enzh_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_alpaca_enzh_oasst1_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_alpaca_zh_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_arxiv_gentitle_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_code_alpaca_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_colorist_e5.py │ │ │ │ ├── qwen_7b_chat_qlora_lawyer_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_medical_e1.py │ │ │ │ ├── qwen_7b_chat_qlora_oasst1_512_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_oasst1_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_open_platypus_e3.py │ │ │ │ ├── qwen_7b_chat_qlora_openorca_e1.py │ │ │ │ ├── qwen_7b_chat_qlora_sql_e3.py │ │ │ │ └── qwen_7b_chat_qlora_tiny_codes_e1.py │ │ └── qwen1_5 │ │ │ ├── qwen1_5_0_5b │ │ │ ├── qwen1_5_0_5b_full_alpaca_e3.py │ │ │ └── qwen1_5_0_5b_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_0_5b_chat │ │ │ ├── qwen1_5_0_5b_chat_full_alpaca_e3.py │ │ │ └── qwen1_5_0_5b_chat_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_110b │ │ │ ├── qwen1_5_110b_full_alpaca_e3.py │ │ │ └── qwen1_5_110b_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_110b_chat │ │ │ ├── README.md │ │ │ ├── qwen1_5_110b_chat_full_alpaca_e3.py │ │ │ ├── qwen1_5_110b_chat_qlora_alpaca_e3.py │ │ │ └── qwen1_5_110b_chat_qlora_alpaca_e3_16k_2gpus.py │ │ │ ├── qwen1_5_14b │ │ │ ├── qwen1_5_14b_full_alpaca_e3.py │ │ │ └── qwen1_5_14b_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_14b_chat │ │ │ ├── qwen1_5_14b_chat_full_alpaca_e3.py │ │ │ └── qwen1_5_14b_chat_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_1_8b │ │ │ ├── qwen1_5_1_8b_full_alpaca_e3.py │ │ │ └── qwen1_5_1_8b_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_1_8b_chat │ │ │ ├── qwen1_5_1_8b_chat_full_alpaca_e3.py │ │ │ └── qwen1_5_1_8b_chat_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_4b │ │ │ ├── qwen1_5_4b_full_alpaca_e3.py │ │ │ └── qwen1_5_4b_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_4b_chat │ │ │ ├── qwen1_5_4b_chat_full_alpaca_e3.py │ │ │ └── qwen1_5_4b_chat_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_72b │ │ │ ├── qwen1_5_72b_full_alpaca_e3.py │ │ │ └── qwen1_5_72b_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_72b_chat │ │ │ ├── qwen1_5_72b_chat_full_alpaca_e3.py │ │ │ └── qwen1_5_72b_chat_qlora_alpaca_e3.py │ │ │ ├── qwen1_5_7b │ │ │ ├── qwen1_5_7b_full_alpaca_e3.py │ │ │ └── qwen1_5_7b_qlora_alpaca_e3.py │ │ │ └── qwen1_5_7b_chat │ │ │ ├── qwen1_5_7b_chat_full_alpaca_e3.py │ │ │ └── qwen1_5_7b_chat_qlora_alpaca_e3.py │ ├── qwen_moe │ │ └── qwen1_5 │ │ │ └── qwen1_5_moe_a2_7_b_chat │ │ │ └── qwen1_5_moe_a2_7_b_chat_full_alpaca_e3.py │ ├── reward_model │ │ ├── internlm │ │ │ ├── internlm2_chat_1_8b_reward_full_ultrafeedback.py │ │ │ ├── internlm2_chat_1_8b_reward_full_varlenattn_jsonl_dataset.py │ │ │ ├── 
internlm2_chat_1_8b_reward_full_varlenattn_ultrafeedback.py │ │ │ └── internlm2_chat_1_8b_reward_qlora_varlenattn_ultrafeedback.py │ │ └── llama │ │ │ └── llama3_8b_instruct_reward_full_varlenattn_ultrafeedback.py │ ├── starcoder │ │ └── starcoder_qlora_stack_exchange_example.py │ ├── yi │ │ ├── yi_34b │ │ │ └── yi_34b_qlora_alpaca_enzh_e3.py │ │ └── yi_6b │ │ │ └── yi_6b_qlora_alpaca_enzh_e3.py │ └── zephyr │ │ └── zephyr_7b_beta_qlora_alpaca_e3.py │ ├── dataset │ ├── __init__.py │ ├── collate_fns │ │ ├── __init__.py │ │ ├── default_collate_fn.py │ │ ├── mmlu_collate_fn.py │ │ └── preference_collate_fn.py │ ├── concat_dataset.py │ ├── huggingface.py │ ├── intern_repo.py │ ├── json_dataset.py │ ├── llava.py │ ├── map_fns │ │ ├── __init__.py │ │ ├── dataset_map_fns │ │ │ ├── __init__.py │ │ │ ├── alpaca_map_fn.py │ │ │ ├── alpaca_zh_map_fn.py │ │ │ ├── arxiv_map_fn.py │ │ │ ├── code_alpaca_map_fn.py │ │ │ ├── colors_map_fn.py │ │ │ ├── crime_kg_assitant_map_fn.py │ │ │ ├── default_map_fn.py │ │ │ ├── law_reference_map_fn.py │ │ │ ├── llava_map_fn.py │ │ │ ├── medical_map_fn.py │ │ │ ├── msagent_map_fn.py │ │ │ ├── oasst1_map_fn.py │ │ │ ├── openai_map_fn.py │ │ │ ├── openorca_map_fn.py │ │ │ ├── pretrain_map_fn.py │ │ │ ├── sql_map_fn.py │ │ │ ├── stack_exchange_map_fn.py │ │ │ ├── tiny_codes_map_fn.py │ │ │ └── wizardlm_map_fn.py │ │ └── template_map_fn.py │ ├── modelscope.py │ ├── moss_sft.py │ ├── preference_dataset.py │ ├── refcoco_json.py │ ├── samplers │ │ ├── __init__.py │ │ ├── intern_repo.py │ │ └── length_grouped.py │ └── utils.py │ ├── engine │ ├── __init__.py │ ├── _strategy │ │ ├── __init__.py │ │ └── deepspeed.py │ ├── hooks │ │ ├── __init__.py │ │ ├── dataset_info_hook.py │ │ ├── evaluate_chat_hook.py │ │ ├── hf_checkpoint_hook.py │ │ ├── throughput_hook.py │ │ └── varlen_attn_args_to_messagehub_hook.py │ └── runner │ │ ├── __init__.py │ │ └── loops.py │ ├── entry_point.py │ ├── evaluation │ ├── __init__.py │ └── metrics │ │ ├── __init__.py │ │ ├── mmlu_metric.py │ │ └── reward_metric.py │ ├── model │ ├── __init__.py │ ├── dpo.py │ ├── llava.py │ ├── modules │ │ ├── __init__.py │ │ ├── dispatch │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── baichuan.py │ │ │ ├── cohere.py │ │ │ ├── deepseek_v2.py │ │ │ ├── internlm.py │ │ │ ├── internlm2.py │ │ │ ├── llama.py │ │ │ ├── mistral.py │ │ │ ├── phi3.py │ │ │ ├── qwen2.py │ │ │ ├── triton_kernels │ │ │ │ ├── __init__.py │ │ │ │ ├── layer_norm.py │ │ │ │ ├── rms_norm.py │ │ │ │ └── rotary.py │ │ │ ├── utils.py │ │ │ └── yi.py │ │ └── projector │ │ │ ├── __init__.py │ │ │ ├── configuration_projector.py │ │ │ └── modeling_projector.py │ ├── orpo.py │ ├── reward.py │ ├── sft.py │ ├── transformers_models │ │ ├── __init__.py │ │ ├── deepseek_v2 │ │ │ ├── __init__.py │ │ │ ├── configuration_deepseek.py │ │ │ ├── modeling_deepseek.py │ │ │ └── tokenization_deepseek_fast.py │ │ └── mixtral │ │ │ ├── __init__.py │ │ │ ├── configuration_mixtral.py │ │ │ └── modeling_mixtral.py │ └── utils.py │ ├── parallel │ ├── __init__.py │ └── sequence │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── comm.py │ │ ├── data_collate.py │ │ ├── reduce_loss.py │ │ ├── sampler.py │ │ └── setup_distributed.py │ ├── registry.py │ ├── tools │ ├── chat.py │ ├── check_custom_dataset.py │ ├── copy_cfg.py │ ├── data_preprocess │ │ ├── arxiv.py │ │ └── convert_refcoco.py │ ├── eval_refcoco.py │ ├── get_data_order.py │ ├── list_cfg.py │ ├── list_dataset_format.py │ ├── log_dataset.py │ ├── mmbench.py │ ├── model_converters │ │ ├── merge.py │ │ ├── 
modeling_internlm2_reward │ │ │ ├── __init__.py │ │ │ ├── configuration_internlm2.py │ │ │ └── modeling_internlm2.py │ │ ├── pth_to_hf.py │ │ └── split.py │ ├── plugins │ │ ├── __init__.py │ │ ├── api.py │ │ ├── calculate.py │ │ ├── search.py │ │ └── solve.py │ ├── process_untokenized_datasets.py │ ├── process_untokenized_datasets_legacy.py │ ├── process_untokenized_llava_data.py │ ├── test.py │ ├── tokenize_ftdp_datasets.py │ ├── train.py │ └── utils.py │ ├── utils │ ├── __init__.py │ ├── constants.py │ ├── fileio.py │ ├── handle_moe_load_and_save.py │ ├── stop_criteria.py │ ├── templates.py │ └── zero_to_any_dtype.py │ └── version.py ├── seg ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── ade_panoptic.py │ │ │ ├── ade_panoptic_ov.py │ │ │ ├── ade_panoptic_ov_720p.py │ │ │ ├── cityscapes_panoptic.py │ │ │ ├── cityscapes_panoptic_720p.py │ │ │ ├── coco_panoptic_lsj.py │ │ │ ├── coco_panoptic_lsj_sam.py │ │ │ ├── coco_panoptic_lsj_sam_720p.py │ │ │ ├── coco_panoptic_video_ade_yt19_yt21_vip_cityscapes.py │ │ │ ├── coco_panoptic_video_lsj.py │ │ │ ├── coco_panoptic_video_yt19_vip_cityscapes_cocopansam.py │ │ │ ├── coco_panoptic_video_yt19_vip_cocopansam.py │ │ │ ├── coco_panoptic_video_yt19_yt21_vip_cityscapes_cocopansam.py │ │ │ ├── davis.py │ │ │ ├── joint_dataset.py │ │ │ ├── objects365v2_detection_lsj.py │ │ │ ├── objects365v2_instance_lsj.py │ │ │ ├── vipseg.py │ │ │ ├── youtube_vis_2019.py │ │ │ ├── youtube_vis_2021.py │ │ │ └── youtube_vis_ovis.py │ │ ├── default_runtime.py │ │ └── schedules │ │ │ ├── schedule_12e.py │ │ │ └── schedule_24e.py │ ├── m2_train_close_set │ │ └── omg_convl_coco_vid_ade_yt19_yt21_vip_city.py │ ├── m2ov_train │ │ ├── omg_convl_vlm_fix_12e_ov_coco_vid_yt19_vip_city_cocopansam.py │ │ └── omg_convl_vlm_fix_12e_ov_coco_vid_yt19_y21_vip_city_cocopansam.py │ └── m2ov_val │ │ ├── datasets │ │ ├── ade.py │ │ ├── cityscapes.py │ │ ├── coco.py │ │ ├── coco_pan_point.py │ │ ├── davis.py │ │ ├── vipseg.py │ │ ├── y19.py │ │ └── y21.py │ │ ├── eval_m2_convl_300q_ov_ade.py │ │ ├── eval_m2_convl_300q_ov_cityscapes.py │ │ ├── eval_m2_convl_300q_ov_coco.py │ │ ├── eval_m2_convl_300q_ov_davis.py │ │ ├── eval_m2_convl_300q_ov_vipseg.py │ │ ├── eval_m2_convl_300q_ov_y19.py │ │ ├── eval_m2_convl_300q_ov_y21.py │ │ ├── eval_m2_convl_ov_coco_pan_point.py │ │ └── models │ │ └── m2_convl_300q.py ├── datasets │ ├── ade_ov.py │ ├── cityscapes.py │ ├── coco_ins_ov.py │ ├── coco_ov.py │ ├── coco_pan_sam.py │ ├── concat_dataset.py │ ├── davis.py │ ├── pipelines │ │ ├── formatting.py │ │ ├── frame_copy.py │ │ ├── frame_sampling.py │ │ ├── loading.py │ │ └── transforms.py │ ├── samplers │ │ ├── batch_sampler.py │ │ └── multi_dataset_sampler.py │ ├── vipseg.py │ └── youtube_vis_dataset.py ├── evaluation │ ├── hooks │ │ └── visual_hook.py │ └── metrics │ │ ├── cityscapes_panoptic_metric.py │ │ ├── ins_cls_iou_metric.py │ │ ├── vip_seg_metric.py │ │ └── vos_metric.py └── models │ ├── backbones │ ├── __init__.py │ └── openclip_backbone.py │ ├── data_preprocessor │ ├── __init__.py │ ├── ovsam_preprocessor.py │ └── vidseg_data_preprocessor.py │ ├── detectors │ ├── __init__.py │ ├── mask2former_vid.py │ └── mask2former_vid_minvis.py │ ├── fusion_head │ ├── __init__.py │ └── omgseg_fusionhead.py │ ├── heads │ ├── __init__.py │ └── mask2former_vid.py │ ├── task_modules │ └── cost.py │ └── utils │ ├── __init__.py │ ├── class_overlapping.py │ ├── load_checkpoint.py │ ├── mask_pool.py │ ├── no_obj.py │ ├── offline_video_metrics.py │ ├── online_pq_utils.py │ ├── pan_seg_transform.py │ └── 
video_gt_preprocess.py └── tools ├── dataset_convert └── vis_to_coco.py ├── dist.sh ├── eval_scripts ├── eval_davis.py └── eval_video.py ├── gen_cls.py ├── slurm.sh ├── test.py └── train.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.jpg filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/EMB.md:
--------------------------------------------------------------------------------
1 | ## Generate text embeddings for each dataset and download the pretrained models
2 |
3 | ### For Separate Datasets (Mainly for Evaluation)
4 |
5 | We use separate per-dataset embeddings for testing.
6 |
7 |
8 | ```commandline
9 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_coco.py 1
10 | ```
11 | ```commandline
12 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_ade.py 1
13 | ```
14 |
15 | ```commandline
16 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_cityscapes.py 1
17 | ```
18 |
19 | ```commandline
20 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_vipseg.py 1
21 | ```
22 |
23 | ```commandline
24 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_y19.py 1
25 | ```
26 |
27 | ```commandline
28 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_y21.py 1
29 | ```
30 |
31 | ### For Merged Dataset Training (Mainly for Co-Training)
32 |
33 | We use a merged-dataset embedding for training.
34 |
35 | ```commandline
36 | ./tools/dist.sh gen_cls seg/configs/m2ov_train/omg_convl_vlm_fix_24e_ov_coco_vid_yt19_vip_city_cocopansam.py 1
37 | ```
38 |
39 | Once the conversion finishes, you will find the generated embedding file in your cache folder.
40 |
41 | ### Download Pre-trained OpenCLIP Models
42 |
43 | When generating the class-embedding classifier, the scripts will automatically download the pre-trained CLIP models.
44 |
45 | If you are in China, you can use [HF-Mirror](https://hf-mirror.com/). Follow the steps below to set the default endpoint.
46 |
47 | ```commandline
48 | pip install -U huggingface_hub
49 | ```
50 |
51 | ```commandline
52 | export HF_ENDPOINT=https://hf-mirror.com
53 | ```
54 |
55 |
56 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This project is licensed under the MIT license.
2 | Copyrights are respective of each contributor listed at the beginning of each definition file.
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 |
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 |
8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/demo/README.md:
--------------------------------------------------------------------------------
1 | # OMG-Seg Demo
2 |
3 | We provide a single-file demo in this folder to make it easy to get started. The commands below assume that you are in the root directory of this project.
4 |
5 | ## Embedding Generation
6 | To use the demo, you first need to provide a class-name list that tells the OMG-Seg model all possible categories, i.e., its vocabulary dictionary. We have already provided a sample vocabulary list in `demo/configs/names/th139_st101.py`.
7 |
8 | Then, we need to generate the class embeddings from these names. You can do this with the following command:
9 | ```commandline
10 | PYTHONPATH=. python tools/gen_cls.py demo/configs/m2_convl.py
11 | ```
12 | The script will automatically read the class list, which is imported in `demo/configs/m2_convl.py` (please refer to `CLASSES` and `DATASET_NAME`), and generate the embeddings.
13 |
14 | ## Run the Demo
15 | After generating the embeddings, you can run the demo with:
16 | ```commandline
17 | PYTHONPATH=. python demo/image_demo.py
18 | ```
19 | for images, and
20 | ```commandline
21 | PYTHONPATH=. python demo/video_demo.py
22 | ```
23 | for videos.
24 |
25 | Please refer to `test_image` and `test_video` for the visualization of the outputs.
26 |
27 | ## Customization
28 | If you want to try your own images or videos, please change `IMG_PATH`, `VID_PATH`, and `MODEL_PATH` accordingly.
29 |
30 | If you want to customize our model, please refer to the config scripts (`demo/configs/m2_convl.py` and `demo/configs/m2_convl_vid.py`) for details.
31 |
32 | Note that all the model-related code has been imported in the config file, so you can follow the corresponding import paths to find the model implementation details.
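33 |
34 | For example, a custom vocabulary file might look like the minimal sketch below (the file name `my_vocab.py` and the class names are purely illustrative; mirror the structure of `demo/configs/names/th139_st101.py` and keep the `CLASSES` and `DATASET_NAME` fields that `demo/configs/m2_convl.py` refers to):
35 | ```python
36 | # demo/configs/names/my_vocab.py -- hypothetical custom vocabulary file.
37 | # DATASET_NAME: an identifier for this vocabulary, referenced by the demo config.
38 | DATASET_NAME = 'my_custom_vocab'
39 |
40 | # CLASSES: every category name the model should be able to predict.
41 | CLASSES = (
42 |     'person', 'car', 'bicycle', 'dog', 'tree', 'building', 'sky',
43 | )
44 | ```
45 | After pointing the class-list import in `demo/configs/m2_convl.py` to your own file, re-run the `gen_cls.py` command above so that the class embeddings match the new vocabulary.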
-------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002020.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002020.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002023.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002023.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002026.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002026.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002029.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002029.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002032.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002032.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002035.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002035.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002038.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002038.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002041.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002041.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002044.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002044.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002047.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002047.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002050.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002050.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002053.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002053.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002056.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002056.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002059.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002059.jpg -------------------------------------------------------------------------------- /demo/images/350_6L1vA-xJt-M/00002062.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002062.jpg -------------------------------------------------------------------------------- /demo/images/sa_1002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/sa_1002.jpg -------------------------------------------------------------------------------- /ext/cityscapes_scripts/helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/ext/cityscapes_scripts/helpers/__init__.py -------------------------------------------------------------------------------- /ext/cityscapes_scripts/helpers/version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | with open(os.path.join(os.path.dirname(__file__), '..', 'VERSION')) as f: 6 | version = f.read().strip() 7 | 8 | if __name__ == "__main__": 9 | print(version) 10 | -------------------------------------------------------------------------------- /ext/davis2017/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | __version__ = '0.1.0' 4 | -------------------------------------------------------------------------------- /ext/meta/sam_meta.py: -------------------------------------------------------------------------------- 1 | meta_dict = { 2 | 'vit_h': dict( 3 | encoder_embed_dim=1280, 4 | encoder_depth=32, 5 | encoder_num_heads=16, 6 | encoder_global_attn_indexes=[7, 15, 23, 31], 7 | # common 8 | prompt_embed_dim=256, 9 | image_size=1024, 10 | vit_patch_size=16, 11 | image_embedding_size=64 12 | ), 13 | 'vit_l': dict( 14 | encoder_embed_dim=1024, 15 | encoder_depth=24, 16 | encoder_num_heads=16, 17 | encoder_global_attn_indexes=[5, 11, 17, 23], 18 | # common 19 | prompt_embed_dim=256, 20 | image_size=1024, 21 | vit_patch_size=16, 22 | image_embedding_size=64 23 | ), 24 | 'vit_b': 
dict( 25 | encoder_embed_dim=768, 26 | encoder_depth=12, 27 | encoder_num_heads=12, 28 | encoder_global_attn_indexes=[2, 5, 8, 11], 29 | # common 30 | prompt_embed_dim=256, 31 | image_size=1024, 32 | vit_patch_size=16, 33 | image_embedding_size=64 34 | ) 35 | } 36 | 37 | checkpoint_dict = { 38 | 'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth', 39 | 'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth', 40 | 'vit_b': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth', 41 | } 42 | -------------------------------------------------------------------------------- /ext/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .coca_model import CoCa 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ 7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype 8 | from .openai import load_openai_model, list_openai_models 9 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ 10 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 11 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub 12 | from .tokenizer import SimpleTokenizer, tokenize, decode 13 | from .transform import image_transform, AugmentationCfg 14 | from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy 15 | from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES 16 | -------------------------------------------------------------------------------- /ext/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/ext/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /ext/open_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /ext/open_clip/generation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/ext/open_clip/generation_utils.py -------------------------------------------------------------------------------- /ext/open_clip/model_configs/EVA01-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | 
"custom_text": true 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/EVA01-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/EVA02-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_base_patch16_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/EVA02-E-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1280, 14 | "heads": 20, 15 | "layers": 32 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/EVA02-E-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/EVA02-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "timm_model_name": "eva02_large_patch14_clip_336", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/EVA02-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_large_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | 
}, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ext/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/RN50x64.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 
128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 
49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } 
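The JSON files under model_configs/ are not read directly by user code; the vendored open_clip factory registers each file under its filename stem and builds the CLIP towers from the embed_dim / vision_cfg / text_cfg fields. A minimal sketch of that lookup, assuming the repository root is on PYTHONPATH so that ext.open_clip is importable (that import path is an assumption, not something shown in this listing):

from ext.open_clip import get_model_config, create_model_and_transforms

# get_model_config returns the dict parsed from ViT-M-16-alt.json (None for unknown names).
cfg = get_model_config('ViT-M-16-alt')
assert cfg['embed_dim'] == 384

# Instantiate the model plus train/val preprocessing; no pretrained weights are fetched here.
model, preprocess_train, preprocess_val = create_model_and_transforms('ViT-M-16-alt', pretrained=None)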
-------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-bigG-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/coca_ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "attn_pooler_heads": 8 28 | }, 29 | "custom_text": true 30 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/coca_ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 768, 25 | "heads": 12, 26 | "layers": 12, 27 | "attn_pooler_heads": 12 28 | }, 29 | "custom_text": true 30 | } 31 | -------------------------------------------------------------------------------- /ext/open_clip/model_configs/coca_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "multimodal_cfg": { 4 | "width": 768, 5 | "context_length": 76, 6 | "vocab_size": 64000, 7 | "mlp_ratio": 4, 8 | "layers": 12, 9 | "dim_head": 64, 10 | "heads": 12, 11 | "n_queries": 256, 12 
| "attn_pooler_heads": 8 13 | }, 14 | "vision_cfg": { 15 | "image_size": 288, 16 | "layers": 12, 17 | "width": 768, 18 | "patch_size": 18, 19 | "output_tokens": true 20 | }, 21 | "text_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 64000, 24 | "layers": 12, 25 | "heads": 12, 26 | "width": 768, 27 | "embed_cls": true, 28 | "output_tokens": true 29 | }, 30 | "custom_text": true 31 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/coca_roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "output_tokens": true 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "linear", 14 | "width": 768, 15 | "output_tokens": true 16 | }, 17 | "multimodal_cfg": { 18 | "context_length": 76, 19 | "width": 768, 20 | "heads": 8, 21 | "layers": 12 22 | }, 23 | "custom_text": true 24 | } 25 | -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12 18 | } 19 | } 
-------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_large_d_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_xlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 20 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | 
"vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/mt5-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /ext/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /ext/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /ext/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 
-------------------------------------------------------------------------------- /ext/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /ext/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /ext/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /ext/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.20.0' 2 | -------------------------------------------------------------------------------- /ext/sam/__init__.py: -------------------------------------------------------------------------------- 1 | from .image_encoder import ImageEncoderViT 2 | from .prompt_encoder import PromptEncoder 3 | from .mask_decoder import MaskDecoder 4 | -------------------------------------------------------------------------------- /ext/sam/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /ext/templates/__init__.py: -------------------------------------------------------------------------------- 1 | from .vild import VILD_PROMPT 2 | -------------------------------------------------------------------------------- /ext/templates/vild.py: -------------------------------------------------------------------------------- 1 | # https://github.com/bytedance/fc-clip/blob/93f3122518e8a3ef98926e5ea761a776d5050430/fcclip/fcclip.py#L26C1-L41C2 2 | VILD_PROMPT = [ 3 | "a photo of a {}.", 4 | "This is a photo of a {}", 5 | "There is a {} in the scene", 6 | "There is the {} in the scene", 7 | "a photo of a {} in the scene", 8 | "a photo of a small {}.", 9 | "a photo of a medium {}.", 10 | "a photo of a large {}.", 11 | "This is a photo of a small {}.", 12 | "This is a photo of a medium {}.", 13 | "This is a photo of a large {}.", 14 | "There is a small {} in the scene.", 15 | "There is a medium {} in the scene.", 16 | "There is a large {} in the scene.", 17 | ] 18 | -------------------------------------------------------------------------------- /figs/method_comparison.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/figs/method_comparison.jpg -------------------------------------------------------------------------------- /figs/omg_teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/figs/omg_teaser.jpg -------------------------------------------------------------------------------- /omg_llava/.owners.yml: -------------------------------------------------------------------------------- 1 | assign: 2 | issues: disabled 3 | pull_requests: disabled 4 | strategy: 5 | random 6 | # daily-shift-based 7 | schedule: 8 | '*/1 * * * *' 9 | -------------------------------------------------------------------------------- /omg_llava/.pre-commit-config-zh-cn.yaml: -------------------------------------------------------------------------------- 1 | exclude: ^tests/data/ 2 | repos: 3 | - repo: 
https://gitee.com/openmmlab/mirrors-flake8 4 | rev: 5.0.4 5 | hooks: 6 | - id: flake8 7 | - repo: https://gitee.com/openmmlab/mirrors-isort 8 | rev: 5.11.5 9 | hooks: 10 | - id: isort 11 | - repo: https://gitee.com/openmmlab/mirrors-yapf 12 | rev: v0.32.0 13 | hooks: 14 | - id: yapf 15 | - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks 16 | rev: v4.3.0 17 | hooks: 18 | - id: trailing-whitespace 19 | - id: check-yaml 20 | - id: end-of-file-fixer 21 | - id: requirements-txt-fixer 22 | - id: double-quote-string-fixer 23 | - id: check-merge-conflict 24 | - id: fix-encoding-pragma 25 | args: ["--remove"] 26 | - id: mixed-line-ending 27 | args: ["--fix=lf"] 28 | - repo: https://gitee.com/openmmlab/mirrors-codespell 29 | rev: v2.2.1 30 | hooks: 31 | - id: codespell 32 | - repo: https://gitee.com/openmmlab/mirrors-mdformat 33 | rev: 0.7.9 34 | hooks: 35 | - id: mdformat 36 | args: ["--number"] 37 | additional_dependencies: 38 | - mdformat-openmmlab 39 | - mdformat_frontmatter 40 | - linkify-it-py 41 | - repo: https://gitee.com/openmmlab/mirrors-docformatter 42 | rev: v1.3.1 43 | hooks: 44 | - id: docformatter 45 | args: ["--in-place", "--wrap-descriptions", "79"] 46 | - repo: https://github.com/asottile/pyupgrade 47 | rev: v3.0.0 48 | hooks: 49 | - id: pyupgrade 50 | args: ["--py36-plus"] 51 | -------------------------------------------------------------------------------- /omg_llava/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: ^tests/data/ 2 | repos: 3 | - repo: https://github.com/PyCQA/flake8 4 | rev: 5.0.4 5 | hooks: 6 | - id: flake8 7 | - repo: https://github.com/PyCQA/isort 8 | rev: 5.11.5 9 | hooks: 10 | - id: isort 11 | - repo: https://github.com/pre-commit/mirrors-yapf 12 | rev: v0.32.0 13 | hooks: 14 | - id: yapf 15 | exclude: 'xtuner/parallel/sequence/__init__.py' 16 | - repo: https://github.com/pre-commit/pre-commit-hooks 17 | rev: v4.3.0 18 | hooks: 19 | - id: trailing-whitespace 20 | - id: check-yaml 21 | - id: end-of-file-fixer 22 | - id: requirements-txt-fixer 23 | - id: double-quote-string-fixer 24 | - id: check-merge-conflict 25 | - id: fix-encoding-pragma 26 | args: ["--remove"] 27 | - id: mixed-line-ending 28 | args: ["--fix=lf"] 29 | - repo: https://github.com/codespell-project/codespell 30 | rev: v2.2.1 31 | hooks: 32 | - id: codespell 33 | - repo: https://github.com/executablebooks/mdformat 34 | rev: 0.7.9 35 | hooks: 36 | - id: mdformat 37 | args: ["--number"] 38 | additional_dependencies: 39 | - mdformat-openmmlab 40 | - mdformat_frontmatter 41 | - linkify-it-py 42 | exclude: 'docs/zh_cn/user_guides/sequence_parallel.md' 43 | - repo: https://github.com/myint/docformatter 44 | rev: v1.3.1 45 | hooks: 46 | - id: docformatter 47 | args: ["--in-place", "--wrap-descriptions", "79"] 48 | - repo: https://github.com/asottile/pyupgrade 49 | rev: v3.0.0 50 | hooks: 51 | - id: pyupgrade 52 | args: ["--py36-plus"] 53 | -------------------------------------------------------------------------------- /omg_llava/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include xtuner/configs *.py *.yml *.json 2 | recursive-include xtuner/tools *.sh *.py 3 | -------------------------------------------------------------------------------- /omg_llava/figs/omg_llava.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/figs/omg_llava.png -------------------------------------------------------------------------------- /omg_llava/omg_llava/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/__init__.py -------------------------------------------------------------------------------- /omg_llava/omg_llava/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/configs/__init__.py -------------------------------------------------------------------------------- /omg_llava/omg_llava/dataset/collect_fns/__init__.py: -------------------------------------------------------------------------------- 1 | from .omg_llava_collate_fn import omg_llava_collate_fn -------------------------------------------------------------------------------- /omg_llava/omg_llava/dataset/process_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .gcg_process import glamm_refcocog_map_fn, glamm_granf_map_fn, glamm_openpsg_map_fn, glamm_flickr_map_fn 2 | from .mdpv_points_process import mdpv_points_map_fn 3 | from .referring_seg_process import referring_seg_map_fn, referring_seg_gcg_format_map_fn 4 | from .region_caption_process import osprey_region_caption_map_fn, osprey_region_caption_gcg_format_map_fn, osprey_region_conversation_map_fn 5 | from .semantic_seg_process import semantic_seg_map_fn, pascal_part_map_fn, semantic_seg_gcg_format_map_fn, pascal_part_gcg_format_map_fn 6 | from .decoupled_gcg_process import glamm_openpsg_decoupled_given_objects_map_fn, glamm_openpsg_decoupled_given_description_map_fn,\ 7 | glamm_flickr_decoupled_given_objects_map_fn, glamm_flickr_decoupled_given_description_map_fn,\ 8 | glamm_granf_decoupled_given_objects_map_fn, glamm_granf_decoupled_given_description_map_fn,\ 9 | glamm_refcocog_decoupled_given_description_map_fn, glamm_refcocog_decoupled_given_objects_map_fn -------------------------------------------------------------------------------- /omg_llava/omg_llava/dataset/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import expand2square, expand2square_mask, expand2square_points, expand2square_bbox -------------------------------------------------------------------------------- /omg_llava/omg_llava/dataset/utils/ade20k_classes.json: -------------------------------------------------------------------------------- 1 | [ 2 | "wall", "building", "sky", "floor", "tree", "ceiling", "road", 3 | "bed", "windowpane", "grass", "cabinet", "sidewalk", 4 | "person", "earth", "door", "table", "mountain", "plant", 5 | "curtain", "chair", "car", "water", "painting", "sofa", 6 | "shelf", "house", "sea", "mirror", "rug", "field", "armchair", 7 | "seat", "fence", "desk", "rock", "wardrobe", "lamp", 8 | "bathtub", "railing", "cushion", "base", "box", "column", 9 | "signboard", "chest of drawers", "counter", "sand", "sink", 10 | "skyscraper", "fireplace", "refrigerator", "grandstand", 11 | "path", "stairs", "runway", "case", "pool table", "pillow", 12 | "screen door", "stairway", "river", "bridge", "bookcase", 13 | "blind", "coffee table", "toilet", "flower", "book", "hill", 14 | "bench", 
"countertop", "stove", "palm", "kitchen island", 15 | "computer", "swivel chair", "boat", "bar", "arcade machine", 16 | "hovel", "bus", "towel", "light", "truck", "tower", 17 | "chandelier", "awning", "streetlight", "booth", 18 | "television receiver", "airplane", "dirt track", "apparel", 19 | "pole", "land", "bannister", "escalator", "ottoman", "bottle", 20 | "buffet", "poster", "stage", "van", "ship", "fountain", 21 | "conveyer belt", "canopy", "washer", "plaything", 22 | "swimming pool", "stool", "barrel", "basket", "waterfall", 23 | "tent", "bag", "minibike", "cradle", "oven", "ball", "food", 24 | "step", "tank", "trade name", "microwave", "pot", "animal", 25 | "bicycle", "lake", "dishwasher", "screen", "blanket", 26 | "sculpture", "hood", "sconce", "vase", "traffic light", 27 | "tray", "ashcan", "fan", "pier", "crt screen", "plate", 28 | "monitor", "bulletin board", "shower", "radiator", "glass", 29 | "clock", "flag" 30 | ] -------------------------------------------------------------------------------- /omg_llava/omg_llava/engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_info_hook import DatasetInfoHook_withSpecoalTokens 2 | from .evaluate_chat_hook import EvaluateChatHook_withSpecialTokens -------------------------------------------------------------------------------- /omg_llava/omg_llava/engine/dataset_info_hook.py: -------------------------------------------------------------------------------- 1 | from xtuner.registry import BUILDER 2 | from xtuner.engine.hooks import DatasetInfoHook 3 | 4 | class DatasetInfoHook_withSpecoalTokens(DatasetInfoHook): 5 | def __init__(self, tokenizer, is_intern_repo_dataset=False): 6 | self.tokenizer = BUILDER.build(tokenizer) 7 | self.is_intern_repo_dataset = is_intern_repo_dataset 8 | # add special tokens 9 | # Adding special tokens for pixel grounding 10 | segmentation_tokens = ['[SEG]'] 11 | # Adding tokens for GCG 12 | phrase_tokens = ['
<p>', '</p>'] 13 | # add for visual prompt 14 | region_tokens = ['<region>'] 15 | point_tokens = ['<mark>'] 16 | special_tokens = segmentation_tokens + phrase_tokens + region_tokens + point_tokens 17 | self.tokenizer.add_tokens(special_tokens, special_tokens=True) -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .convnext_clip import OpenCLIPBackbone, OpenCLIPBackbone_omgseg 2 | from .modules import ProjectorConfig_OMG_LLaVA, ProjectorModel_OMG_LLaVA 3 | from .omg_seg import OMGSegVisualEncoder, Mask2FormerVideoSemSamHead 4 | from .omg_llava import OMG_LLaVA -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .openclip_backbone import OpenCLIPBackbone, OpenCLIPBackbone_omgseg -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .coca_model import CoCa 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ 7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype 8 | from .openai import load_openai_model, list_openai_models 9 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ 10 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 11 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub 12 | from .tokenizer import SimpleTokenizer, tokenize, decode 13 | from .transform import image_transform, AugmentationCfg 14 | from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy 15 | from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES 16 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/model/convnext_clip/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/generation_utils.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/model/convnext_clip/open_clip/generation_utils.py -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA01-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA01-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_base_patch16_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-E-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1280, 14 | "heads": 20, 15 | "layers": 32 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-E-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-L-14-336.json: -------------------------------------------------------------------------------- 
1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "timm_model_name": "eva02_large_patch14_clip_336", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_large_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50x64.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 
-------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | 
"patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 
-------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/coca_ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "attn_pooler_heads": 8 28 | }, 29 | "custom_text": true 30 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/coca_ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 768, 25 | "heads": 12, 26 | "layers": 12, 27 | "attn_pooler_heads": 12 28 | }, 29 | "custom_text": true 30 | } 31 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/coca_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "multimodal_cfg": { 4 | "width": 768, 5 | "context_length": 76, 6 | "vocab_size": 64000, 7 | "mlp_ratio": 4, 8 | "layers": 12, 9 | "dim_head": 64, 10 | "heads": 12, 11 | "n_queries": 256, 12 | "attn_pooler_heads": 8 13 | }, 14 | "vision_cfg": { 15 | "image_size": 288, 16 | "layers": 12, 17 | "width": 768, 18 | "patch_size": 18, 19 | "output_tokens": true 20 | }, 21 | "text_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 64000, 24 | "layers": 12, 25 | "heads": 12, 26 | "width": 768, 27 | "embed_cls": true, 28 | "output_tokens": true 29 | }, 30 | "custom_text": true 31 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/coca_roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "output_tokens": true 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "linear", 14 | "width": 768, 15 | "output_tokens": true 16 | }, 17 | "multimodal_cfg": { 18 | "context_length": 76, 19 | "width": 768, 20 | "heads": 8, 21 | "layers": 12 22 | }, 23 | "custom_text": true 24 | } 25 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_base.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 640, 16 | "heads": 10, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_large_d_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | 
"text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 224 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_xlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 20 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "timm_drop": 0.0, 9 | "timm_drop_path": 0.1, 10 | "image_size": 320 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 1024, 16 | "heads": 16, 17 | "layers": 24 18 | } 19 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/mt5-base-ViT-B-32.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- 
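Besides the native towers, the configs in this directory can delegate either side of the model: a `timm_model_name` in `vision_cfg` (the convnext_*, swin and vit_*_gap/relpos files above) pulls the image tower from timm, while an `hf_model_name` plus `pooler_type` in `text_cfg` (the mt5-* and roberta files above, and the xlm-roberta files just below) wraps a Hugging Face text encoder. The snippet below is a sketch under the assumption that the upstream open_clip factory helper `get_model_config` is importable; the model names are taken from the files shown here.

```python
# Sketch (assumes the upstream open_clip factory helpers are importable).
import open_clip

for name in ["convnext_large_d_320", "swin_base_patch4_window7_224",
             "mt5-base-ViT-B-32", "xlm-roberta-large-ViT-H-14", "ViT-B-32"]:
    cfg = open_clip.get_model_config(name)
    vision = cfg["vision_cfg"].get("timm_model_name", "native")  # timm backbone or built-in tower
    text = cfg["text_cfg"].get("hf_model_name", "native")        # HF encoder or built-in transformer
    print(f"{name:30s} vision tower: {vision:30s} text tower: {text}")
```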
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/convnext_clip/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.20.0' 2 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from xtuner.model import * 2 | from .projector import ProjectorModel_OMG_LLaVA, ProjectorConfig_OMG_LLaVA 3 | 4 | __all__ = ['ProjectorConfig_OMG_LLaVA', 'ProjectorModel_OMG_LLaVA', ] 5 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/modules/projector/__init__.py: -------------------------------------------------------------------------------- 1 | from xtuner.model.modules.projector import * 2 | from transformers import AutoConfig, AutoModel 3 | from .configuration_projector import ProjectorConfig_OMG_LLaVA 4 | from .modeling_projector import ProjectorModel_OMG_LLaVA 5 | 6 | AutoConfig.register('projector', ProjectorConfig_OMG_LLaVA) 7 | AutoModel.register(ProjectorConfig_OMG_LLaVA, ProjectorModel_OMG_LLaVA) 8 | 9 | __all__ = ['ProjectorConfig_OMG_LLaVA', 'ProjectorModel_OMG_LLaVA'] 10 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/modules/projector/configuration_projector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from transformers import PretrainedConfig 3 | 4 | class ProjectorConfig_OMG_LLaVA(PretrainedConfig): 5 | model_type = 'projector' 6 | _auto_class = 'AutoConfig' 7 | 8 | def __init__( 9 | self, 10 | visual_hidden_size=4096, 11 | llm_hidden_size=4096, 12 | depth=2, 13 | hidden_act='gelu', 14 | bias=True, 15 | query_channels=256, 16 | feat_channels=1536, 17 | pixel_shuffle_ratio=None, 18 | additional_bg_tokens=10, 19 | visual_prompt_proj=False, 20 | add_cross_attn_layer=False, 21 | **kwargs, 22 | ): 23 | self.visual_hidden_size = visual_hidden_size 24 | self.llm_hidden_size = llm_hidden_size 25 | self.depth = depth 26 | self.hidden_act = hidden_act 27 | self.bias = bias 28 | self.query_channels=query_channels 29 | self.feat_channels=feat_channels 30 | if pixel_shuffle_ratio is not None: 31 | self.feat_channels = self.feat_channels * pixel_shuffle_ratio * pixel_shuffle_ratio 32 | self.additional_bg_tokens = additional_bg_tokens 33 | self.visual_prompt_proj = visual_prompt_proj 34 | self.add_cross_attn_layer = add_cross_attn_layer 35 | super().__init__(**kwargs) -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/omg_seg/__init__.py: -------------------------------------------------------------------------------- 1 | from .omg_seg_visual_encoder import OMGSegVisualEncoder 2 | from .mask2former_vid_semanticsam import Mask2FormerVideoSemSamHead -------------------------------------------------------------------------------- /omg_llava/omg_llava/model/omg_seg/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | def mask_pool(x, mask): 5 | """ 6 | Args: 7 | x: [B, C, H, W] 8 | mask: [B, Q, H, W] 9 | """ 10 | if not x.shape[-2:] == mask.shape[-2:]: 11 | # reshape mask to x 12 | mask = F.interpolate(mask, size=x.shape[-2:], mode='bilinear', align_corners=False) 13 | with torch.no_grad(): 14 | mask = mask.detach() 15 | mask = (mask > 0).to(mask.dtype) 16 | denorm = mask.sum(dim=(-1, -2), keepdim=True) + 1e-8 17 | 18 | mask_pooled_x = torch.einsum( 19 | "bchw,bqhw->bqc", 20 | x, 21 | mask / denorm, 22 | ) 23 | 24 | return mask_pooled_x 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /omg_llava/omg_llava/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/tools/__init__.py -------------------------------------------------------------------------------- /omg_llava/requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/runtime.txt 2 | -r requirements/deepspeed.txt 3 | -r requirements/modelscope.txt 4 | 5 | gradio==4.37.2 6 | gradio-image-prompter 7 | pycocotools 8 | timm 9 | ftfy 10 | kornia -------------------------------------------------------------------------------- /omg_llava/requirements/deepspeed.txt: -------------------------------------------------------------------------------- 1 | # Minimum 0.12.3, see https://github.com/microsoft/DeepSpeed/pull/4587 2 | deepspeed>=0.12.3 3 | mpi4py-mpich 4 | -------------------------------------------------------------------------------- /omg_llava/requirements/docs.txt: -------------------------------------------------------------------------------- 1 | docutils 2 | myst-parser==2.0.0 3 | sphinx==6.2.1 4 | sphinx-argparse 5 | 
sphinx-book-theme==1.0.1 6 | sphinx-copybutton==0.5.2 7 | sphinx_markdown_tables 8 | -------------------------------------------------------------------------------- /omg_llava/requirements/modelscope.txt: -------------------------------------------------------------------------------- 1 | modelscope 2 | -------------------------------------------------------------------------------- /omg_llava/requirements/runtime.txt: -------------------------------------------------------------------------------- 1 | # Minimum 0.40.0.post4 to fix some 4-bit precision bugs 2 | bitsandbytes>=0.40.0.post4 3 | # Minimum 2.16.0 to fix some bugs, see https://github.com/huggingface/datasets/pull/6444 4 | datasets>=2.16.0 5 | einops 6 | # Minimum 0.1.2 to fix some bugs, see https://github.com/InternLM/lagent/pull/44 7 | lagent>=0.1.2 8 | # Minimum 0.10.3 to support distributed evaluation for MMBench 9 | # see https://github.com/open-mmlab/mmengine/pull/1469 10 | mmengine>=0.10.3 11 | openpyxl 12 | # Minimum 0.4.0 to support QLoRA, see https://github.com/huggingface/peft/pull/476 13 | triton==2.1.0 14 | peft>=0.4.0 15 | scikit-image 16 | scipy 17 | SentencePiece 18 | tiktoken 19 | torch 20 | torchvision 21 | # Minimum 4.36.0 to support `Cache` data structure used by KV Cache 22 | # Registering a causal mask in `LlamaModel` is not friendly for very large 23 | # `max_position_embeddings`. Refer to 24 | # https://github.com/huggingface/transformers/blob/v4.38.0/src/transformers/models/llama/modeling_llama.py#L921-L923 25 | # transformers>=4.36.0,!=4.38.0,!=4.38.1,!=4.38.2 26 | transformers==4.36.0 27 | transformers_stream_generator 28 | -------------------------------------------------------------------------------- /omg_llava/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 79 3 | multi_line_output = 0 4 | extra_standard_library = setuptools 5 | known_first_party = xtuner 6 | known_third_party = pytest,yaml 7 | no_lines_before = STDLIB,LOCALFOLDER 8 | default_section = THIRDPARTY 9 | 10 | [yapf] 11 | BASED_ON_STYLE = pep8 12 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 13 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 14 | 15 | [codespell] 16 | ignore-words-list = nd, ba, warmup 17 | -------------------------------------------------------------------------------- /omg_llava/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/test.jpg -------------------------------------------------------------------------------- /omg_llava/xtuner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import os 3 | 4 | from mmengine.utils import digit_version 5 | 6 | from .entry_point import cli 7 | from .version import __version__, version_info 8 | 9 | HF_CEPH_HUB = os.getenv('HF_CEPH_HUB', '') 10 | HF_USE_CEPH = os.getenv('HF_USE_CEPH', 0) or HF_CEPH_HUB != '' 11 | DS_CEPH_DIR = os.getenv('DS_CEPH_DIR', None) 12 | if HF_USE_CEPH: 13 | from .utils.fileio import (patch_hf_auto_from_pretrained, 14 | patch_hf_save_pretrained) 15 | patch_hf_auto_from_pretrained(HF_CEPH_HUB) 16 | patch_hf_save_pretrained() 17 | 18 | if DS_CEPH_DIR: 19 | from .utils.fileio import patch_deepspeed_engine 20 | patch_deepspeed_engine() 21 | 22 | __all__ = [ 23 | '__version__', 'version_info', 'digit_version', 'cli', 'HF_USE_CEPH', 24 | 'DS_CEPH_DIR' 25 | ] 26 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .datasets import * # noqa: F401, F403 3 | from .model import * # noqa: F401, F403 4 | from .training_args import * # noqa: F401, F403 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/datasets/arxiv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import arxiv_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def arxiv_dataset(tokenizer, 13 | data_file=None, 14 | max_length=2048, 15 | prompt_template=PROMPT_TEMPLATE.default, 16 | remove_unused_columns=True, 17 | pack_to_max_length=True): 18 | template_map_fn = template_map_fn_factory(template=prompt_template) 19 | # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv # noqa: E501 20 | # 2. Process data with `./tools/data_preprocess/arxiv.py` 21 | if data_file is None: 22 | data_file = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' 23 | dataset_org = load_dataset(path='json', data_files=dict(train=data_file)) 24 | dataset = process_hf_dataset( 25 | dataset=dataset_org, 26 | tokenizer=tokenizer, 27 | max_length=max_length, 28 | dataset_map_fn=arxiv_map_fn, 29 | template_map_fn=template_map_fn, 30 | remove_unused_columns=remove_unused_columns, 31 | shuffle_before_pack=True, 32 | pack_to_max_length=pack_to_max_length) 33 | 34 | return dataset 35 | 36 | 37 | def arxiv_data_collator(return_hf_format=False): 38 | return partial(default_collate_fn, return_hf_format=return_hf_format) 39 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/datasets/code_alpaca.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def code_alpaca_dataset(tokenizer, 13 | path='HuggingFaceH4/CodeAlpaca_20K', 14 | max_length=2048, 15 | prompt_template=PROMPT_TEMPLATE.default, 16 | remove_unused_columns=True, 17 | pack_to_max_length=True): 18 | template_map_fn = template_map_fn_factory(template=prompt_template) 19 | dataset_org = load_dataset(path) 20 | dataset = process_hf_dataset( 21 | dataset=dataset_org, 22 | tokenizer=tokenizer, 23 | max_length=max_length, 24 | dataset_map_fn=code_alpaca_map_fn, 25 | template_map_fn=template_map_fn, 26 | remove_unused_columns=remove_unused_columns, 27 | shuffle_before_pack=True, 28 | pack_to_max_length=pack_to_max_length) 29 | 30 | return dataset 31 | 32 | 33 | def code_alpaca_data_collator(return_hf_format=False): 34 | return partial(default_collate_fn, return_hf_format=return_hf_format) 35 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/datasets/colorist.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import colors_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def colorist_dataset(tokenizer, 13 | path='burkelibbey/colors', 14 | max_length=2048, 15 | prompt_template=PROMPT_TEMPLATE.default, 16 | remove_unused_columns=True, 17 | pack_to_max_length=True): 18 | template_map_fn = template_map_fn_factory(template=prompt_template) 19 | dataset_org = load_dataset(path) 20 | dataset = process_hf_dataset( 21 | dataset=dataset_org, 22 | tokenizer=tokenizer, 23 | max_length=max_length, 24 | dataset_map_fn=colors_map_fn, 25 | template_map_fn=template_map_fn, 26 | remove_unused_columns=remove_unused_columns, 27 | shuffle_before_pack=True, 28 | pack_to_max_length=pack_to_max_length) 29 | 30 | return dataset 31 | 32 | 33 | def colorist_data_collator(return_hf_format=False): 34 | return partial(default_collate_fn, return_hf_format=return_hf_format) 35 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/datasets/medical.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import medical_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def medical_dataset(tokenizer, 13 | path='shibing624/medical', 14 | max_length=2048, 15 | prompt_template=PROMPT_TEMPLATE.default, 16 | remove_unused_columns=False, 17 | pack_to_max_length=True): 18 | template_map_fn = template_map_fn_factory(template=prompt_template) 19 | dataset_org = load_dataset(path) 20 | dataset = process_hf_dataset( 21 | dataset=dataset_org, 22 | tokenizer=tokenizer, 23 | max_length=max_length, 24 | dataset_map_fn=medical_map_fn, 25 | template_map_fn=template_map_fn, 26 | remove_unused_columns=remove_unused_columns, 27 | shuffle_before_pack=True, 28 | pack_to_max_length=pack_to_max_length) 29 | 30 | return dataset 31 | 32 | 33 | def medical_data_collator(return_hf_format=False): 34 | return partial(default_collate_fn, return_hf_format=return_hf_format) 35 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/datasets/oasst1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def oasst1_dataset(tokenizer, 13 | path='timdettmers/openassistant-guanaco', 14 | max_length=2048, 15 | prompt_template=PROMPT_TEMPLATE.default, 16 | remove_unused_columns=False, 17 | pack_to_max_length=True): 18 | template_map_fn = template_map_fn_factory(template=prompt_template) 19 | dataset_org = load_dataset(path) 20 | dataset = process_hf_dataset( 21 | dataset=dataset_org, 22 | tokenizer=tokenizer, 23 | max_length=max_length, 24 | dataset_map_fn=oasst1_map_fn, 25 | template_map_fn=template_map_fn, 26 | remove_unused_columns=remove_unused_columns, 27 | shuffle_before_pack=True, 28 | pack_to_max_length=pack_to_max_length) 29 | 30 | return dataset 31 | 32 | 33 | def oasst1_data_collator(return_hf_format=False): 34 | return partial(default_collate_fn, return_hf_format=return_hf_format) 35 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/datasets/open_orca.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import openorca_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def openorca_dataset(tokenizer, 13 | path='Open-Orca/OpenOrca', 14 | max_length=2048, 15 | prompt_template=PROMPT_TEMPLATE.default, 16 | remove_unused_columns=True, 17 | pack_to_max_length=True): 18 | template_map_fn = template_map_fn_factory(template=prompt_template) 19 | dataset_org = load_dataset(path) 20 | dataset = process_hf_dataset( 21 | dataset=dataset_org, 22 | tokenizer=tokenizer, 23 | max_length=max_length, 24 | dataset_map_fn=openorca_map_fn, 25 | template_map_fn=template_map_fn, 26 | remove_unused_columns=remove_unused_columns, 27 | shuffle_before_pack=True, 28 | pack_to_max_length=pack_to_max_length) 29 | 30 | return dataset 31 | 32 | 33 | def openorca_data_collator(return_hf_format=False): 34 | return partial(default_collate_fn, return_hf_format=return_hf_format) 35 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/datasets/sql.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import sql_map_fn, template_map_fn_factory 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def sql_dataset(tokenizer, 13 | path='b-mc2/sql-create-context', 14 | max_length=2048, 15 | prompt_template=PROMPT_TEMPLATE.default, 16 | remove_unused_columns=True, 17 | pack_to_max_length=True): 18 | template_map_fn = template_map_fn_factory(template=prompt_template) 19 | dataset_org = load_dataset(path) 20 | dataset = process_hf_dataset( 21 | dataset=dataset_org, 22 | tokenizer=tokenizer, 23 | max_length=max_length, 24 | dataset_map_fn=sql_map_fn, 25 | template_map_fn=template_map_fn, 26 | remove_unused_columns=remove_unused_columns, 27 | shuffle_before_pack=True, 28 | pack_to_max_length=pack_to_max_length) 29 | 30 | return dataset 31 | 32 | 33 | def sql_data_collator(return_hf_format=False): 34 | return partial(default_collate_fn, return_hf_format=return_hf_format) 35 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/datasets/tiny_codes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import template_map_fn_factory, tiny_codes_map_fn 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def tiny_codes_dataset(tokenizer, 13 | path='nampdn-ai/tiny-codes', 14 | max_length=2048, 15 | prompt_template=PROMPT_TEMPLATE.default, 16 | remove_unused_columns=True, 17 | pack_to_max_length=True): 18 | template_map_fn = template_map_fn_factory(template=prompt_template) 19 | dataset_org = load_dataset(path) 20 | dataset = process_hf_dataset( 21 | dataset=dataset_org, 22 | tokenizer=tokenizer, 23 | max_length=max_length, 24 | dataset_map_fn=tiny_codes_map_fn, 25 | template_map_fn=template_map_fn, 26 | remove_unused_columns=remove_unused_columns, 27 | shuffle_before_pack=True, 28 | pack_to_max_length=pack_to_max_length) 29 | 30 | return dataset 31 | 32 | 33 | def tiny_codes_data_collator(return_hf_format=False): 34 | return partial(default_collate_fn, return_hf_format=return_hf_format) 35 | -------------------------------------------------------------------------------- /omg_llava/xtuner/apis/datasets/wizardlm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from datasets import load_dataset 5 | 6 | from xtuner.dataset import process_hf_dataset 7 | from xtuner.dataset.collate_fns import default_collate_fn 8 | from xtuner.dataset.map_fns import template_map_fn_factory, wizardlm_map_fn 9 | from xtuner.utils import PROMPT_TEMPLATE 10 | 11 | 12 | def wizardlm_dataset(tokenizer, 13 | path='WizardLM/WizardLM_evol_instruct_V2_196k', 14 | max_length=2048, 15 | prompt_template=PROMPT_TEMPLATE.default, 16 | remove_unused_columns=False, 17 | pack_to_max_length=True): 18 | template_map_fn = template_map_fn_factory(template=prompt_template) 19 | dataset_org = load_dataset(path) 20 | dataset = process_hf_dataset( 21 | dataset=dataset_org, 22 | tokenizer=tokenizer, 23 | max_length=max_length, 24 | dataset_map_fn=wizardlm_map_fn, 25 | template_map_fn=template_map_fn, 26 | remove_unused_columns=remove_unused_columns, 27 | shuffle_before_pack=True, 28 | pack_to_max_length=pack_to_max_length) 29 | 30 | return dataset 31 | 32 | 33 | def wizardlm_data_collator(return_hf_format=False): 34 | return partial(default_collate_fn, return_hf_format=return_hf_format) 35 | -------------------------------------------------------------------------------- /omg_llava/xtuner/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import os 3 | 4 | 5 | def get_cfgs_name_path(): 6 | path = os.path.dirname(__file__) 7 | mapping = {} 8 | for root, dirs, files in os.walk(path): 9 | for file_ in files: 10 | if file_.endswith( 11 | ('.py', '.json') 12 | ) and not file_.startswith('.') and not file_.startswith('_'): 13 | mapping[os.path.splitext(file_)[0]] = os.path.join(root, file_) 14 | return mapping 15 | 16 | 17 | cfgs_name_path = get_cfgs_name_path() 18 | 19 | __all__ = ['cfgs_name_path'] 20 | -------------------------------------------------------------------------------- /omg_llava/xtuner/configs/cohere/README.md: -------------------------------------------------------------------------------- 1 | # Cohere 8x7B 2 | 3 | ## Install 4 | 5 | ```bash 6 | # Install the latest xtuner 7 | pip install -U 'xtuner[deepspeed]' 8 | 9 | # Cohere requires the latest version of transformers. 10 | pip install git+https://github.com/huggingface/transformers.git 11 | 12 | # Sequence parallel requires flash-attn 13 | pip install flash-attn 14 | ``` 15 | 16 | ## Full Parameter Fine-tune 17 | 18 | Full parameter fine-tuning needs 64 A100-80G GPUs. 19 | 20 | ### slurm 21 | 22 | Note: `$PARTITION` means the virtual partition of slurm. 23 | 24 | ```bash 25 | srun -p $PARTITION --job-name=Cohere --nodes=8 --gres=gpu:8 --ntasks-per-node=8 xtuner train cohere_100b_128k_sp32 --deepspeed deepspeed_zero3 --launcher slurm 26 | ``` 27 | 28 | ### torchrun 29 | 30 | Note: `$NODE_0_ADDR` means the IP address of the node_0 machine. 31 | 32 | ```bash 33 | # execute on node 0 34 | NPROC_PER_NODE=8 NNODES=8 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=0 xtuner train cohere_100b_128k_sp32 --deepspeed deepspeed_zero3 35 | 36 | # execute on node 1 37 | NPROC_PER_NODE=8 NNODES=8 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=1 xtuner train cohere_100b_128k_sp32 --deepspeed deepspeed_zero3 38 | ``` 39 | 40 | ### Speed 41 | 42 | 16 * A100 80G: 43 | 44 | | Model | Sequence Length | GPUs Number | Sequence Parallel World Size | Tokens per Second | TFLOPs | 45 | | :---------: | :-------------: | :---------: | :--------------------------: | :---------------: | :----: | 46 | | Cohere_100b | 128k | 64 | 32 | 97.3 | 173.4 | 47 | | Cohere_100b | 128k | 128 | 16 | 102.1 | 182.7 | 48 | | Cohere_100b | 128k | 256 | 16 | 101.3 | 181.3 | 49 | -------------------------------------------------------------------------------- /omg_llava/xtuner/configs/deepspeed/deepspeed_zero1.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 | "zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 1, 9 | "overlap_comm": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "initial_scale_power": 16 14 | }, 15 | "bf16": { 16 | "enabled": "auto" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /omg_llava/xtuner/configs/deepspeed/deepspeed_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 | "zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 2, 9 | "overlap_comm": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "initial_scale_power": 16 14 | }, 15 | "bf16": { 16 | "enabled": "auto" 17 |
} 18 | } 19 | -------------------------------------------------------------------------------- /omg_llava/xtuner/configs/deepspeed/deepspeed_zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 | "zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 2, 9 | "overlap_comm": true, 10 | "offload_optimizer": { 11 | "device": "cpu", 12 | "pin_memory": true 13 | } 14 | }, 15 | "fp16": { 16 | "enabled": "auto", 17 | "initial_scale_power": 16 18 | }, 19 | "bf16": { 20 | "enabled": "auto" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /omg_llava/xtuner/configs/deepspeed/deepspeed_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 | "zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 3, 9 | "overlap_comm": true, 10 | "stage3_gather_16bit_weights_on_model_save": true 11 | }, 12 | "fp16": { 13 | "enabled": "auto", 14 | "initial_scale_power": 16 15 | }, 16 | "bf16": { 17 | "enabled": "auto" 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /omg_llava/xtuner/configs/deepspeed/deepspeed_zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradient_accumulation_steps": "auto", 3 | "train_micro_batch_size_per_gpu": "auto", 4 | "gradient_clipping": "auto", 5 | "zero_allow_untested_optimizer": true, 6 | "zero_force_ds_cpu_optimizer": false, 7 | "zero_optimization": { 8 | "stage": 3, 9 | "overlap_comm": true, 10 | "offload_optimizer": { 11 | "device": "cpu", 12 | "pin_memory": true 13 | }, 14 | "offload_param": { 15 | "device": "cpu", 16 | "pin_memory": true 17 | }, 18 | "stage3_gather_16bit_weights_on_model_save": true 19 | }, 20 | "fp16": { 21 | "enabled": "auto", 22 | "initial_scale_power": 16 23 | }, 24 | "bf16": { 25 | "enabled": "auto" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /omg_llava/xtuner/configs/qwen/qwen1_5/qwen1_5_110b_chat/README.md: -------------------------------------------------------------------------------- 1 | # Qwen 110B 2 | 3 | ## Install 4 | 5 | ```bash 6 | # Install the latest xtuner 7 | pip install -U 'xtuner[deepspeed]' 8 | 9 | # We recommend installing flash_attn 10 | # pip install flash-attn 11 | 12 | # install the latest transformers 13 | pip install -U transformers 14 | ``` 15 | 16 | ## QLoRA Fine-tune 17 | 18 | Training Qwen 110B with 32k context capability requires only 2 * A100 80G. 19 | 20 | ```bash 21 | xtuner train xtuner/configs/qwen/qwen1_5/qwen1_5_110b_chat/qwen1_5_110b_chat_qlora_alpaca_e3_16k_2gpus.py --deepspeed deepspeed_zero3 22 | ``` 23 | 24 |
25 | 26 |
27 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import warnings 3 | 4 | from .concat_dataset import ConcatDataset 5 | from .huggingface import process_hf_dataset 6 | from .intern_repo import (build_packed_dataset, 7 | load_intern_repo_tokenized_dataset, 8 | load_intern_repo_untokenized_dataset) 9 | from .json_dataset import load_json_file 10 | from .llava import LLaVADataset 11 | from .modelscope import process_ms_dataset 12 | from .moss_sft import MOSSSFTDataset 13 | from .refcoco_json import (InvRefCOCOJsonDataset, RefCOCOJsonDataset, 14 | RefCOCOJsonEvalDataset) 15 | from .utils import decode_base64_to_image, expand2square, load_image 16 | 17 | # ignore FutureWarning in hf datasets 18 | warnings.simplefilter(action='ignore', category=FutureWarning) 19 | 20 | __all__ = [ 21 | 'process_hf_dataset', 'ConcatDataset', 'MOSSSFTDataset', 22 | 'process_ms_dataset', 'LLaVADataset', 'expand2square', 23 | 'decode_base64_to_image', 'load_image', 'process_ms_dataset', 24 | 'load_intern_repo_tokenized_dataset', 25 | 'load_intern_repo_untokenized_dataset', 'build_packed_dataset', 26 | 'RefCOCOJsonDataset', 'RefCOCOJsonEvalDataset', 'InvRefCOCOJsonDataset', 27 | 'load_json_file' 28 | ] 29 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/collate_fns/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .default_collate_fn import default_collate_fn 3 | from .mmlu_collate_fn import mmlu_collate_fn 4 | 5 | __all__ = ['default_collate_fn', 'mmlu_collate_fn'] 6 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/collate_fns/mmlu_collate_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from typing import Dict, Sequence 3 | 4 | import torch 5 | from torch.nn.utils.rnn import pad_sequence 6 | 7 | from xtuner.utils import DEFAULT_PAD_TOKEN_INDEX, IGNORE_INDEX 8 | 9 | 10 | def mmlu_collate_fn(instances: Sequence[Dict], 11 | pad_index: int = DEFAULT_PAD_TOKEN_INDEX, 12 | return_hf_format: bool = False) -> Dict[str, torch.Tensor]: 13 | input_ids = [] 14 | labels = [] 15 | data_samples = {'labels': [], 'subjects': []} 16 | for example in instances: 17 | input_ids.append(torch.tensor(example['input_ids'])) 18 | labels.append(torch.tensor(example['labels'])) 19 | data_samples['labels'].append(example['output']) 20 | data_samples['subjects'].append(example['subject']) 21 | if len(instances) > 1: 22 | input_ids = pad_sequence( 23 | input_ids, batch_first=True, padding_value=pad_index) 24 | labels = pad_sequence( 25 | labels, batch_first=True, padding_value=IGNORE_INDEX) 26 | else: 27 | input_ids = torch.stack(input_ids) 28 | labels = torch.stack(labels) 29 | 30 | data_dict = { 31 | 'input_ids': input_ids, 32 | 'attention_mask': input_ids.ne(pad_index), 33 | 'labels': labels 34 | } 35 | 36 | if return_hf_format: 37 | return data_dict 38 | else: 39 | return {'data': data_dict, 'data_samples': data_samples} 40 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from torch.utils.data import ConcatDataset as TorchConcatDataset 3 | 4 | from xtuner.registry import BUILDER 5 | 6 | 7 | class ConcatDataset(TorchConcatDataset): 8 | 9 | def __init__(self, datasets): 10 | datasets_instance = [] 11 | for cfg in datasets: 12 | datasets_instance.append(BUILDER.build(cfg)) 13 | super().__init__(datasets=datasets_instance) 14 | 15 | def __repr__(self): 16 | main_str = 'Dataset as a concatenation of multiple datasets. \n' 17 | main_str += ',\n'.join( 18 | [f'{repr(dataset)}' for dataset in self.datasets]) 19 | return main_str 20 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/json_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from datasets import Dataset, concatenate_datasets 5 | 6 | 7 | def load_json_file(data_files=None, data_dir=None, suffix=None): 8 | assert (data_files is not None) != (data_dir is not None) 9 | if data_dir is not None: 10 | data_files = os.listdir(data_dir) 11 | data_files = [os.path.join(data_dir, fn) for fn in data_files] 12 | if suffix is not None: 13 | data_files = [fp for fp in data_files if fp.endswith(suffix)] 14 | elif isinstance(data_files, str): 15 | data_files = [data_files] 16 | 17 | dataset_list = [] 18 | for fp in data_files: 19 | with open(fp, encoding='utf-8') as file: 20 | data = json.load(file) 21 | ds = Dataset.from_list(data) 22 | dataset_list.append(ds) 23 | dataset = concatenate_datasets(dataset_list) 24 | return dataset 25 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .dataset_map_fns import * # noqa: F401, F403 3 | from .template_map_fn import template_map_fn # noqa: F401 4 | from .template_map_fn import template_map_fn_factory # noqa: F401 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/alpaca_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | 4 | def alpaca_map_fn(example): 5 | if example.get('output') == '': 6 | return {'conversation': []} 7 | else: 8 | return { 9 | 'conversation': [{ 10 | 'input': f"{example['instruction']}\n{example['input']}", 11 | 'output': example['output'] 12 | }] 13 | } 14 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/alpaca_zh_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | 4 | def alpaca_zh_map_fn(example): 5 | return { 6 | 'conversation': [{ 7 | 'input': f"{example['instruction_zh']}\n{example['input_zh']}", 8 | 'output': example['output_zh'] 9 | }] 10 | } 11 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/arxiv_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def arxiv_map_fn(example): 6 | return { 7 | 'conversation': [{ 8 | 'system': SYSTEM_TEMPLATE.arxiv_gentile, 9 | 'input': example['abstract'], 10 | 'output': example['title'] 11 | }] 12 | } 13 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/code_alpaca_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def code_alpaca_map_fn(example): 6 | return { 7 | 'conversation': [{ 8 | 'system': SYSTEM_TEMPLATE.coder, 9 | 'input': example['prompt'], 10 | 'output': example['completion'] 11 | }] 12 | } 13 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/colors_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def colors_map_fn(example): 6 | desc = ':'.join(example['description'].split(':')[1:]).strip() 7 | return { 8 | 'conversation': [{ 9 | 'system': SYSTEM_TEMPLATE.colorist, 10 | 'input': desc, 11 | 'output': example['color'] 12 | }] 13 | } 14 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/crime_kg_assitant_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def crime_kg_assitant_map_fn(example): 6 | return { 7 | 'conversation': [{ 8 | 'system': SYSTEM_TEMPLATE.lawyer, 9 | 'input': example['input'], 10 | 'output': example['output'] 11 | }] 12 | } 13 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/default_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def default_map_fn(example): 3 | return { 4 | 'conversation': [{ 5 | 'input': example['input'], 6 | 'output': example['output'] 7 | }] 8 | } 9 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/law_reference_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def law_reference_map_fn(example): 6 | return { 7 | 'conversation': [{ 8 | 'system': SYSTEM_TEMPLATE.lawyer, 9 | 'input': example['question'], 10 | 'output': example['answer'] 11 | }] 12 | } 13 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import DEFAULT_IMAGE_TOKEN 3 | 4 | 5 | def llava_image_only_map_fn(example): 6 | # input contains the DEFAULT_IMAGE_TOKEN only 7 | messages = example['conversations'] 8 | input = '' 9 | conversation = [] 10 | while messages and messages[0]['from'] == 'gpt': 11 | # Skip the first one if it is from gpt 12 | messages = messages[1:] 13 | for msg in messages: 14 | if msg['from'] == 'human': 15 | assert DEFAULT_IMAGE_TOKEN in msg['value'] 16 | input += DEFAULT_IMAGE_TOKEN 17 | elif msg['from'] == 'gpt': 18 | conversation.append({'input': input, 'output': msg['value']}) 19 | input = '' 20 | else: 21 | raise NotImplementedError 22 | return {'conversation': conversation} 23 | 24 | 25 | def llava_map_fn(example): 26 | messages = example['conversations'] 27 | input = '' 28 | conversation = [] 29 | while messages and messages[0]['from'] == 'gpt': 30 | # Skip the first one if it is from gpt 31 | messages = messages[1:] 32 | for msg in messages: 33 | if msg['from'] == 'human': 34 | if DEFAULT_IMAGE_TOKEN in msg['value']: 35 | msg['value'] = msg['value'].replace(DEFAULT_IMAGE_TOKEN, 36 | '').strip() 37 | msg['value'] = DEFAULT_IMAGE_TOKEN + '\n' + msg['value'] 38 | msg['value'] = msg['value'].strip() 39 | input += msg['value'] 40 | 41 | elif msg['from'] == 'gpt': 42 | conversation.append({'input': input, 'output': msg['value']}) 43 | input = '' 44 | else: 45 | raise NotImplementedError 46 | return {'conversation': conversation} 47 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/medical_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def medical_map_fn(example): 6 | return { 7 | 'conversation': [{ 8 | 'system': SYSTEM_TEMPLATE.medical, 9 | 'input': '{instruction}\n{input}'.format(**example), 10 | 'output': example['output'] 11 | }] 12 | } 13 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/oasst1_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def oasst1_map_fn(example): 3 | r"""Example before preprocessing: 4 | example['text'] = '### Human: Can you explain xxx' 5 | '### Assistant: Sure! xxx' 6 | '### Human: I didn't understand how xxx' 7 | '### Assistant: It has to do with a process xxx.' 8 | 9 | Example after preprocessing: 10 | example['conversation'] = [ 11 | { 12 | 'input': 'Can you explain xxx', 13 | 'output': 'Sure! xxx' 14 | }, 15 | { 16 | 'input': 'I didn't understand how xxx', 17 | 'output': 'It has to do with a process xxx.' 18 | } 19 | ] 20 | """ 21 | data = [] 22 | for sentence in example['text'].strip().split('###'): 23 | sentence = sentence.strip() 24 | if sentence[:6] == 'Human:': 25 | data.append(sentence[6:].strip()) 26 | elif sentence[:10] == 'Assistant:': 27 | data.append(sentence[10:].strip()) 28 | if len(data) % 2: 29 | # The last round of conversation solely consists of input 30 | # without any output. 31 | # Discard the input part of the last round, as this part is ignored in 32 | # the loss calculation. 33 | data.pop() 34 | conversation = [] 35 | for i in range(0, len(data), 2): 36 | single_turn_conversation = {'input': data[i], 'output': data[i + 1]} 37 | conversation.append(single_turn_conversation) 38 | return {'conversation': conversation} 39 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/openai_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def openai_map_fn(example): 3 | """ 4 | Example before preprocessing: 5 | example["messages"] = [ 6 | { "role": "system", "content": "You are an assistant that 7 | occasionally misspells words." }, 8 | { "role": "user", "content": "Tell me a story." }, 9 | { "role": "assistant", "content": "One day a student 10 | went to schoool." } 11 | ] 12 | Example after preprocessing: 13 | example["conversation"] = [ 14 | { 15 | "system": "You are an assistant that occasionally misspells 16 | words.", 17 | "input": "Tell me a story.", 18 | "output": "One day a student went to schoool." 
19 | } 20 | ] 21 | """ 22 | messages = example['messages'] 23 | system = '' 24 | input = '' 25 | conversation = [] 26 | while messages and messages[0]['role'] == 'assistant': 27 | # Skip the first one if it is from assistant 28 | messages = messages[1:] 29 | for msg in messages: 30 | if msg['role'] == 'system': 31 | system = msg['content'] 32 | elif msg['role'] == 'user': 33 | input += msg['content'] 34 | elif msg['role'] == 'assistant': 35 | conversation.append({ 36 | 'system': system, 37 | 'input': input, 38 | 'output': msg['content'] 39 | }) 40 | system = '' 41 | input = '' 42 | else: 43 | raise NotImplementedError 44 | return {'conversation': conversation} 45 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/openorca_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def openorca_map_fn(example): 3 | return { 4 | 'conversation': [{ 5 | 'system': example['system_prompt'], 6 | 'input': example['question'], 7 | 'output': example['response'] 8 | }] 9 | } 10 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/pretrain_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def pretrain_map_fn(example): 3 | r"""Example before preprocessing: 4 | example['text'] = 'xxx' 5 | 6 | Example after preprocessing: 7 | example['conversation'] = [ 8 | { 9 | 'input': '', 10 | 'output': 'xxx' 11 | }, 12 | ] 13 | """ 14 | return { 15 | 'conversation': [{ 16 | 'input': '', 17 | 'output': example['text'].strip(), 18 | 'need_eos_token': False 19 | }] 20 | } 21 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/sql_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def sql_map_fn(example): 6 | return { 7 | 'conversation': [{ 8 | 'system': SYSTEM_TEMPLATE.sql, 9 | 'input': '{context}\n{question}'.format(**example), 10 | 'output': example['answer'] 11 | }] 12 | } 13 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/stack_exchange_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def stack_exchange_map_fn(example): 3 | return { 4 | 'conversation': [{ 5 | 'input': example['question'], 6 | 'output': example['response'] 7 | }] 8 | } 9 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/tiny_codes_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from xtuner.utils import SYSTEM_TEMPLATE 3 | 4 | 5 | def tiny_codes_map_fn(example): 6 | return { 7 | 'conversation': [{ 8 | 'system': SYSTEM_TEMPLATE.coder, 9 | 'input': example['prompt'], 10 | 'output': example['response'] 11 | }] 12 | } 13 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/dataset_map_fns/wizardlm_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def wizardlm_map_fn(example): 3 | messages = example['conversations'] 4 | input = '' 5 | conversation = [] 6 | while messages and messages[0]['from'] == 'gpt': 7 | # Skip the first one if it is from gpt 8 | messages = messages[1:] 9 | for msg in messages: 10 | if msg['from'] == 'human': 11 | input += msg['value'] 12 | elif msg['from'] == 'gpt': 13 | conversation.append({'input': input, 'output': msg['value']}) 14 | input = '' 15 | else: 16 | raise NotImplementedError 17 | return {'conversation': conversation} 18 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/map_fns/template_map_fn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from functools import partial 3 | 4 | from mmengine.utils.misc import get_object_from_string 5 | 6 | 7 | def template_map_fn(example, template): 8 | conversation = example.get('conversation', []) 9 | for i, single_turn_conversation in enumerate(conversation): 10 | input = single_turn_conversation.get('input', '') 11 | if input is None: 12 | input = '' 13 | input_text = template.INSTRUCTION.format(input=input, round=i + 1) 14 | system = single_turn_conversation.get('system', '') 15 | if system != '' and system is not None: 16 | system = template.SYSTEM.format(system=system) 17 | input_text = system + input_text 18 | single_turn_conversation['input'] = input_text 19 | 20 | if template.get('SUFFIX', None): 21 | output_text = single_turn_conversation.get('output', '') 22 | output_text += template.SUFFIX 23 | single_turn_conversation['output'] = output_text 24 | 25 | # SUFFIX_AS_EOS is False ==> need_eos_token is True 26 | single_turn_conversation['need_eos_token'] = \ 27 | not template.get('SUFFIX_AS_EOS', False) 28 | single_turn_conversation['sep'] = template.get('SEP', '') 29 | 30 | return {'conversation': conversation} 31 | 32 | 33 | def template_map_fn_factory(template): 34 | if isinstance(template, str): # for resume 35 | template = get_object_from_string(template) 36 | return partial(template_map_fn, template=template) 37 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/modelscope.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
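# process_ms_dataset builds the dataset from a config if needed, selects the
# requested split, converts it to a HuggingFace dataset via `to_hf_dataset()`,
# and delegates the remaining preprocessing to `process_hf_dataset`.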
2 | from mmengine.config import Config, ConfigDict 3 | 4 | from xtuner.registry import BUILDER 5 | from .huggingface import process_hf_dataset 6 | 7 | 8 | def process_ms_dataset(dataset, split='train', *args, **kwargs): 9 | """Post-process the dataset loaded from the ModelScope Hub.""" 10 | 11 | if isinstance(dataset, (Config, ConfigDict)): 12 | dataset = BUILDER.build(dataset) 13 | if isinstance(dataset, dict): 14 | dataset = dataset[split] 15 | dataset = dataset.to_hf_dataset() 16 | return process_hf_dataset(dataset, *args, **kwargs) 17 | -------------------------------------------------------------------------------- /omg_llava/xtuner/dataset/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .intern_repo import InternlmRepoSampler, InternRepoSampler 2 | from .length_grouped import LengthGroupedSampler 3 | 4 | __all__ = ['LengthGroupedSampler', 'InternRepoSampler', 'InternlmRepoSampler'] 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from ._strategy import DeepSpeedStrategy 3 | from .hooks import (DatasetInfoHook, EvaluateChatHook, ThroughputHook, 4 | VarlenAttnArgsToMessageHubHook) 5 | from .runner import TrainLoop 6 | 7 | __all__ = [ 8 | 'EvaluateChatHook', 'DatasetInfoHook', 'ThroughputHook', 9 | 'VarlenAttnArgsToMessageHubHook', 'DeepSpeedStrategy', 'TrainLoop' 10 | ] 11 | -------------------------------------------------------------------------------- /omg_llava/xtuner/engine/_strategy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .deepspeed import DeepSpeedStrategy 3 | 4 | __all__ = ['DeepSpeedStrategy'] 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/engine/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dataset_info_hook import DatasetInfoHook 3 | from .evaluate_chat_hook import EvaluateChatHook 4 | from .hf_checkpoint_hook import HFCheckpointHook 5 | from .throughput_hook import ThroughputHook 6 | from .varlen_attn_args_to_messagehub_hook import VarlenAttnArgsToMessageHubHook 7 | 8 | __all__ = [ 9 | 'EvaluateChatHook', 'DatasetInfoHook', 'ThroughputHook', 10 | 'VarlenAttnArgsToMessageHubHook', 'HFCheckpointHook' 11 | ] 12 | -------------------------------------------------------------------------------- /omg_llava/xtuner/engine/runner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .loops import TrainLoop 3 | 4 | __all__ = ['TrainLoop'] 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/engine/runner/loops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
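# TrainLoop accepts exactly one of `max_iters` or `max_epochs`; when epochs are
# given, the dataloader is built first and the iteration budget becomes
# max_epochs * len(dataloader) before deferring to IterBasedTrainLoop.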
2 | from typing import Dict, Optional, Union 3 | 4 | from mmengine.runner import IterBasedTrainLoop 5 | from torch.utils.data import DataLoader 6 | 7 | 8 | class TrainLoop(IterBasedTrainLoop): 9 | 10 | def __init__(self, 11 | runner, 12 | dataloader: Union[DataLoader, Dict], 13 | max_iters: Optional[int] = None, 14 | max_epochs: Union[int, float] = None, 15 | **kwargs) -> None: 16 | 17 | if max_iters is None and max_epochs is None: 18 | raise RuntimeError('Please specify the `max_iters` or ' 19 | '`max_epochs` in `train_cfg`.') 20 | elif max_iters is not None and max_epochs is not None: 21 | raise RuntimeError('Only one of `max_iters` or `max_epochs` can ' 22 | 'exist in `train_cfg`.') 23 | else: 24 | if max_iters is not None: 25 | iters = int(max_iters) 26 | assert iters == max_iters, ('`max_iters` should be a integer ' 27 | f'number, but get {max_iters}') 28 | elif max_epochs is not None: 29 | if isinstance(dataloader, dict): 30 | diff_rank_seed = runner._randomness_cfg.get( 31 | 'diff_rank_seed', False) 32 | dataloader = runner.build_dataloader( 33 | dataloader, 34 | seed=runner.seed, 35 | diff_rank_seed=diff_rank_seed) 36 | iters = max_epochs * len(dataloader) 37 | else: 38 | raise NotImplementedError 39 | super().__init__( 40 | runner=runner, dataloader=dataloader, max_iters=iters, **kwargs) 41 | -------------------------------------------------------------------------------- /omg_llava/xtuner/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .metrics import MMLUMetric 3 | 4 | __all__ = ['MMLUMetric'] 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/evaluation/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .mmlu_metric import MMLUMetric 3 | 4 | __all__ = ['MMLUMetric'] 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .llava import LLaVAModel 3 | from .sft import SupervisedFinetune 4 | 5 | __all__ = ['SupervisedFinetune', 'LLaVAModel'] 6 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .dispatch import dispatch_modules 2 | from .projector import ProjectorConfig, ProjectorModel 3 | 4 | __all__ = ['dispatch_modules', 'ProjectorConfig', 'ProjectorModel'] 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/modules/dispatch/triton_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .layer_norm import layer_norm_forward 3 | from .rms_norm import rms_norm_forward 4 | from .rotary import apply_rotary_emb 5 | 6 | __all__ = ['rms_norm_forward', 'layer_norm_forward', 'apply_rotary_emb'] 7 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/modules/dispatch/triton_kernels/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def layer_norm_forward(self, hidden_states): 7 | input_dtype = hidden_states.dtype 8 | hidden_states = hidden_states.to(torch.float32) 9 | hidden_states = F.layer_norm( 10 | hidden_states, (hidden_states.shape[-1], ), eps=self.variance_epsilon) 11 | hidden_states = self.weight.to(torch.float32) * hidden_states 12 | return hidden_states.to(input_dtype) 13 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/modules/projector/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from transformers import AutoConfig, AutoModel 3 | 4 | from .configuration_projector import ProjectorConfig 5 | from .modeling_projector import ProjectorModel 6 | 7 | AutoConfig.register('projector', ProjectorConfig) 8 | AutoModel.register(ProjectorConfig, ProjectorModel) 9 | 10 | __all__ = ['ProjectorConfig', 'ProjectorModel'] 11 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/modules/projector/configuration_projector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from transformers import PretrainedConfig 3 | 4 | 5 | class ProjectorConfig(PretrainedConfig): 6 | model_type = 'projector' 7 | _auto_class = 'AutoConfig' 8 | 9 | def __init__( 10 | self, 11 | visual_hidden_size=4096, 12 | llm_hidden_size=4096, 13 | depth=2, 14 | hidden_act='gelu', 15 | bias=True, 16 | **kwargs, 17 | ): 18 | self.visual_hidden_size = visual_hidden_size 19 | self.llm_hidden_size = llm_hidden_size 20 | self.depth = depth 21 | self.hidden_act = hidden_act 22 | self.bias = bias 23 | super().__init__(**kwargs) 24 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/transformers_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepseek_v2 import (DeepseekTokenizerFast, DeepseekV2Config, 2 | DeepseekV2ForCausalLM, DeepseekV2Model) 3 | from .mixtral import MixtralConfig, MixtralForCausalLM, MixtralModel 4 | 5 | __all__ = [ 6 | 'DeepseekTokenizerFast', 'DeepseekV2Config', 'DeepseekV2ForCausalLM', 7 | 'DeepseekV2Model', 'MixtralConfig', 'MixtralForCausalLM', 'MixtralModel' 8 | ] 9 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/transformers_models/deepseek_v2/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_deepseek import DeepseekV2Config 2 | from .modeling_deepseek import DeepseekV2ForCausalLM, DeepseekV2Model 3 | from .tokenization_deepseek_fast import DeepseekTokenizerFast 4 | 5 | __all__ = [ 6 | 'DeepseekV2ForCausalLM', 'DeepseekV2Model', 'DeepseekV2Config', 7 | 'DeepseekTokenizerFast' 8 | ] 9 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/transformers_models/deepseek_v2/tokenization_deepseek_fast.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | from transformers.models.llama import LlamaTokenizerFast 4 | 5 | 6 | class DeepseekTokenizerFast(LlamaTokenizerFast): 7 | 8 | def convert_ids_to_tokens( 9 | self, 10 | ids: Union[int, List[int]], 11 | skip_special_tokens: bool = False) -> Union[str, 
List[str]]: 12 | """Converts a single index or a sequence of indices in a token or a 13 | sequence of tokens, using the vocabulary and added tokens. 14 | 15 | Args: 16 | ids (`int` or `List[int]`): 17 | The token id (or token ids) to convert to tokens. 18 | skip_special_tokens (`bool`, *optional*, defaults to `False`): 19 | Whether or not to remove special tokens in the decoding. 20 | 21 | Returns: 22 | `str` or `List[str]`: The decoded token(s). 23 | """ 24 | if isinstance(ids, int): 25 | return self._convert_id_to_token(ids) 26 | tokens = [] 27 | for index in ids: 28 | index = int(index) 29 | if skip_special_tokens and index in self.all_special_ids: 30 | continue 31 | token = self._tokenizer.id_to_token(index) 32 | tokens.append(token if token is not None else '') 33 | return tokens 34 | 35 | def _convert_id_to_token(self, index: int) -> Optional[str]: 36 | token = self._tokenizer.id_to_token(int(index)) 37 | return token if token is not None else '' 38 | -------------------------------------------------------------------------------- /omg_llava/xtuner/model/transformers_models/mixtral/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_mixtral import MixtralConfig 2 | from .modeling_mixtral import MixtralForCausalLM, MixtralModel 3 | 4 | __all__ = ['MixtralForCausalLM', 'MixtralModel', 'MixtralConfig'] 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .sequence import * # noqa: F401, F403 3 | -------------------------------------------------------------------------------- /omg_llava/xtuner/parallel/sequence/reduce_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | from .setup_distributed import get_sequence_parallel_group 5 | 6 | 7 | class _ReduceLoss(torch.autograd.Function): 8 | 9 | @staticmethod 10 | def forward(ctx, mean_loss, loss_scale, process_group): 11 | ctx.mode = process_group 12 | if loss_scale == 0: 13 | # convert nan to 0 just for logging 14 | mean_loss = torch.nan_to_num(mean_loss) 15 | loss_sum = mean_loss * loss_scale 16 | dist.all_reduce(loss_sum, group=process_group) 17 | dist.all_reduce(loss_scale, group=process_group) 18 | loss = loss_sum / loss_scale 19 | return loss 20 | 21 | @staticmethod 22 | def backward(ctx, grad_output): 23 | return grad_output, None, None 24 | 25 | 26 | def reduce_sequence_parallel_loss(mean_loss, 27 | loss_scale, 28 | sp_group: dist.ProcessGroup = None): 29 | if dist.get_world_size(sp_group) == 1: 30 | return mean_loss 31 | if sp_group is None: 32 | # avoid bc breaking 33 | sp_group = get_sequence_parallel_group() 34 | return _ReduceLoss.apply(mean_loss, loss_scale, sp_group) 35 | -------------------------------------------------------------------------------- /omg_llava/xtuner/parallel/sequence/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
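# SequenceParallelSampler follows mmengine's DefaultSampler but shards data by
# the data-parallel rank and world size, so that ranks within the same
# sequence-parallel group can receive identical samples while different groups
# still see different shards.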
2 | import math 3 | from typing import Optional, Sized 4 | 5 | from mmengine.dataset import DefaultSampler 6 | from mmengine.dist import sync_random_seed 7 | 8 | from .setup_distributed import (get_data_parallel_rank, 9 | get_data_parallel_world_size) 10 | 11 | 12 | class SequenceParallelSampler(DefaultSampler): 13 | 14 | def __init__(self, 15 | dataset: Sized, 16 | shuffle: bool = True, 17 | seed: Optional[int] = None, 18 | round_up: bool = True) -> None: 19 | rank = get_data_parallel_rank() 20 | world_size = get_data_parallel_world_size() 21 | self.rank = rank 22 | self.world_size = world_size 23 | 24 | self.dataset = dataset 25 | self.shuffle = shuffle 26 | if seed is None: 27 | seed = sync_random_seed() 28 | self.seed = seed 29 | self.epoch = 0 30 | self.round_up = round_up 31 | 32 | if self.round_up: 33 | self.num_samples = math.ceil(len(self.dataset) / world_size) 34 | self.total_size = self.num_samples * self.world_size 35 | else: 36 | self.num_samples = math.ceil( 37 | (len(self.dataset) - rank) / world_size) 38 | self.total_size = len(self.dataset) 39 | -------------------------------------------------------------------------------- /omg_llava/xtuner/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmengine.registry import Registry 3 | 4 | __all__ = ['BUILDER', 'MAP_FUNC'] 5 | 6 | BUILDER = Registry('builder') 7 | MAP_FUNC = Registry('map_fn') 8 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/copy_cfg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os.path as osp 4 | import shutil 5 | 6 | from mmengine.utils import mkdir_or_exist 7 | 8 | from xtuner.configs import cfgs_name_path 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('config_name', help='config name') 14 | parser.add_argument('save_dir', help='save directory for copied config') 15 | args = parser.parse_args() 16 | return args 17 | 18 | 19 | def add_copy_suffix(string): 20 | file_name, ext = osp.splitext(string) 21 | return f'{file_name}_copy{ext}' 22 | 23 | 24 | def main(): 25 | args = parse_args() 26 | mkdir_or_exist(args.save_dir) 27 | config_path = cfgs_name_path[args.config_name] 28 | save_path = osp.join(args.save_dir, 29 | add_copy_suffix(osp.basename(config_path))) 30 | shutil.copyfile(config_path, save_path) 31 | print(f'Copy to {save_path}') 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/data_preprocess/convert_refcoco.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import argparse 3 | import json 4 | 5 | from xtuner.dataset.refcoco_json import RefCOCOJsonDataset 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | '--ann-path', 12 | default='data/refcoco_annotations', 13 | help='Refcoco annotation path', 14 | ) 15 | parser.add_argument( 16 | '--image-path', 17 | default='data/llava_data/llava_images/coco/train2017', 18 | help='COCO image path', 19 | ) 20 | parser.add_argument( 21 | '--save-path', default='./', help='The folder to save converted data') 22 | args = parser.parse_args() 23 | return args 24 | 25 | 26 | if __name__ == '__main__': 27 | args = parse_args() 28 | 29 | data_info = [ 30 | ('refcoco', 'unc'), 31 | ('refcoco+', 'unc'), 32 | ('refcocog', 'umd'), 33 | ] 34 | all_data = [] 35 | for dataset, split in data_info: 36 | data = RefCOCOJsonDataset.get_data_json( 37 | ann_path=args.ann_path, 38 | image_path=args.image_path, 39 | dataset=dataset, 40 | splitBy=split, 41 | )[0] 42 | all_data.extend(data) 43 | save_path = args.save_path + '/train.json' 44 | with open(save_path, 'w') as f: 45 | print(f'save to {save_path} with {len(all_data)} items.') 46 | print(all_data[0]) 47 | json.dump(all_data, f, indent=4) 48 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/get_data_order.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os 4 | 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--data-folder', help='Data folder') 9 | parser.add_argument('--save-folder', help='The folder to save data order.') 10 | parser.add_argument( 11 | '--file-type', 12 | default='.bin', 13 | help='We want to get the order of the file in this type.') 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def save_data_order(data_folder, save_folder, file_type='.bin'): 19 | assert os.path.exists(data_folder), f'{data_folder} does not exist.' 20 | triples = list(os.walk(data_folder, followlinks=True)) 21 | data_order = [] 22 | for root, dirs, files in triples: 23 | dirs.sort() 24 | print(f'Reading {root}...') 25 | for fn in sorted(files): 26 | if fn.endswith(file_type): 27 | fp = os.path.join(root, fn) 28 | # Using relative paths so that you can get the same result 29 | # on different clusters 30 | fp = fp.replace(data_folder, '')[1:] 31 | data_order.append(fp) 32 | 33 | save_path = os.path.join(save_folder, 'data_order.txt') 34 | with open(save_path, 'w') as f: 35 | for fp in data_order: 36 | f.write(fp + '\n') 37 | 38 | 39 | if __name__ == '__main__': 40 | args = parse_args() 41 | save_data_order(args.data_folder, args.save_folder, args.file_type) 42 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/list_cfg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
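# Prints the names of all built-in configs collected in `cfgs_name_path`,
# optionally filtered by a case-insensitive substring given via `-p/--pattern`.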
2 | import argparse 3 | 4 | from xtuner.configs import cfgs_name_path 5 | 6 | 7 | def parse_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument( 10 | '-p', '--pattern', default=None, help='Pattern for fuzzy matching') 11 | args = parser.parse_args() 12 | return args 13 | 14 | 15 | def main(pattern=None): 16 | args = parse_args() 17 | configs_names = sorted(list(cfgs_name_path.keys())) 18 | print('==========================CONFIGS===========================') 19 | if args.pattern is not None: 20 | print(f'PATTERN: {args.pattern}') 21 | print('-------------------------------') 22 | for name in configs_names: 23 | if args.pattern is None or args.pattern.lower() in name.lower(): 24 | print(name) 25 | print('=============================================================') 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/list_dataset_format.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from xtuner.dataset.map_fns import DATASET_FORMAT_MAPPING 3 | 4 | 5 | def main(): 6 | dataset_format = DATASET_FORMAT_MAPPING.keys() 7 | print('======================DATASET_FORMAT======================') 8 | for format in dataset_format: 9 | print(format) 10 | print('==========================================================') 11 | 12 | 13 | if __name__ == '__main__': 14 | main() 15 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/model_converters/modeling_internlm2_reward/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/xtuner/tools/model_converters/modeling_internlm2_reward/__init__.py -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .api import plugins_api 3 | 4 | __all__ = ['plugins_api'] 5 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/plugins/api.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import re 3 | 4 | 5 | def plugins_api(input_str, 6 | calculate_open=True, 7 | solve_open=True, 8 | search_open=True): 9 | 10 | pattern = r'(Solve|solve|Solver|solver|Calculate|calculate|Calculator|calculator|Search)\("([^"]*)"\)' # noqa: E501 11 | 12 | matches = re.findall(pattern, input_str) 13 | 14 | converted_str = '<|Results|>:\n' 15 | 16 | for i in range(len(matches)): 17 | if matches[i][0] in [ 18 | 'Calculate', 'calculate' 19 | 'Calculator', 'calculator' 20 | ]: 21 | if calculate_open: 22 | from .calculate import Calculate 23 | result = Calculate(matches[i][1]) 24 | else: 25 | result = None 26 | converted_str += f"Calculate(\"{matches[i][1]}\") => {result}\n" 27 | elif matches[i][0] in ['Solve', 'solve', 'Solver', 'solver']: 28 | if solve_open: 29 | from .solve import Solve 30 | result = Solve(matches[i][1]) 31 | else: 32 | result = None 33 | converted_str += f"Solve(\"{matches[i][1]}\") =>\n{result}\n" 34 | elif matches[i][0] == 'Search': 35 | if search_open: 36 | from .search import Search 37 | result = Search(matches[i][1]) 38 | else: 39 | result = None 40 | converted_str += f"Search(\"{matches[i][1]}\") =>\n{result}" 41 | 42 | converted_str += '\n' 43 | return converted_str 44 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/plugins/calculate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from math import * # noqa: F401, F403 3 | 4 | 5 | def Calculate(expression): 6 | res = '' 7 | for exp in expression.split(';'): 8 | try: 9 | res += '{:.2f};'.format(eval(exp.replace('^', '**'))) 10 | except Exception: 11 | res += 'No result.' 12 | if res[-1] == ';': 13 | res = res[:-1] 14 | return res 15 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/plugins/search.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import os 3 | import sys 4 | 5 | import requests 6 | 7 | try: 8 | SERPER_API_KEY = os.environ['SERPER_API_KEY'] 9 | except Exception: 10 | print('Please obtain the `SERPER_API_KEY` from https://serper.dev and ' 11 | 'set it using `export SERPER_API_KEY=xxx`.') 12 | sys.exit(1) 13 | 14 | 15 | def parse_results(results, k=10): 16 | snippets = [] 17 | 18 | for result in results['organic'][:k]: 19 | if 'snippet' in result: 20 | snippets.append(result['snippet']) 21 | for attribute, value in result.get('attributes', {}).items(): 22 | snippets.append(f'{attribute}: {value}.') 23 | return snippets 24 | 25 | 26 | def search(api_key, search_term, **kwargs): 27 | headers = { 28 | 'X-API-KEY': api_key, 29 | 'Content-Type': 'application/json', 30 | } 31 | params = { 32 | 'q': search_term, 33 | **{key: value 34 | for key, value in kwargs.items() if value is not None}, 35 | } 36 | try: 37 | response = requests.post( 38 | 'https://google.serper.dev/search', 39 | headers=headers, 40 | params=params, 41 | timeout=5) 42 | except Exception as e: 43 | return -1, str(e) 44 | return response.status_code, response.json() 45 | 46 | 47 | def Search(q, k=10): 48 | status_code, response = search(SERPER_API_KEY, q) 49 | if status_code != 200: 50 | ret = 'None\n' 51 | else: 52 | text = parse_results(response, k=k) 53 | ret = '' 54 | for idx, res in enumerate(text): 55 | ret += f"<|{idx+1}|>: '{res}'\n" 56 | return ret 57 | -------------------------------------------------------------------------------- /omg_llava/xtuner/tools/process_untokenized_llava_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import warnings 4 | 5 | from mmengine import Config 6 | 7 | from xtuner.registry import BUILDER 8 | 9 | # ignore FutureWarning in hf datasets 10 | warnings.simplefilter(action='ignore', category=FutureWarning) 11 | 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('config', help='config file name or path.') 16 | parser.add_argument('--save-folder', help='The folder to save data order.') 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def build_llava_dataset(config): 22 | dataset = BUILDER.build(config.train_dataloader.dataset) 23 | return dataset 24 | 25 | 26 | if __name__ == '__main__': 27 | args = parse_args() 28 | cfg = Config.fromfile(args.config) 29 | 30 | llava_dataset = build_llava_dataset(cfg) 31 | text_data = llava_dataset.text_data 32 | 33 | text_data.save_to_disk(args.save_folder) 34 | -------------------------------------------------------------------------------- /omg_llava/xtuner/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .constants import (DEFAULT_IMAGE_TOKEN, DEFAULT_PAD_TOKEN_INDEX, 3 | IGNORE_INDEX, IMAGE_TOKEN_INDEX) 4 | from .handle_moe_load_and_save import (SUPPORT_MODELS, get_origin_state_dict, 5 | load_state_dict_into_model) 6 | from .stop_criteria import StopWordStoppingCriteria 7 | from .templates import PROMPT_TEMPLATE, SYSTEM_TEMPLATE 8 | 9 | __all__ = [ 10 | 'IGNORE_INDEX', 'DEFAULT_PAD_TOKEN_INDEX', 'PROMPT_TEMPLATE', 11 | 'DEFAULT_IMAGE_TOKEN', 'SYSTEM_TEMPLATE', 'StopWordStoppingCriteria', 12 | 'IMAGE_TOKEN_INDEX', 'load_state_dict_into_model', 'get_origin_state_dict', 13 | 'SUPPORT_MODELS' 14 | ] 15 | -------------------------------------------------------------------------------- /omg_llava/xtuner/utils/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | IGNORE_INDEX = -100 3 | DEFAULT_PAD_TOKEN_INDEX = 0 4 | IMAGE_TOKEN_INDEX = -200 5 | DEFAULT_IMAGE_TOKEN = '' 6 | -------------------------------------------------------------------------------- /omg_llava/xtuner/utils/stop_criteria.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from transformers import StoppingCriteria 3 | 4 | 5 | class StopWordStoppingCriteria(StoppingCriteria): 6 | """StopWord stopping criteria.""" 7 | 8 | def __init__(self, tokenizer, stop_word): 9 | self.tokenizer = tokenizer 10 | self.stop_word = stop_word 11 | self.length = len(self.stop_word) 12 | 13 | def __call__(self, input_ids, *args, **kwargs) -> bool: 14 | cur_text = self.tokenizer.decode(input_ids[0]) 15 | cur_text = cur_text.replace('\r', '').replace('\n', '') 16 | return cur_text[-self.length:] == self.stop_word 17 | -------------------------------------------------------------------------------- /omg_llava/xtuner/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | __version__ = '0.1.21' 3 | short_version = __version__ 4 | 5 | 6 | def parse_version_info(version_str): 7 | """Parse a version string into a tuple. 8 | 9 | Args: 10 | version_str (str): The version string. 11 | Returns: 12 | tuple[int or str]: The version info, e.g., "1.3.0" is parsed into 13 | (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1'). 14 | """ 15 | version_info = [] 16 | for x in version_str.split('.'): 17 | if x.isdigit(): 18 | version_info.append(int(x)) 19 | elif x.find('rc') != -1: 20 | patch_version = x.split('rc') 21 | version_info.append(int(patch_version[0])) 22 | version_info.append(f'rc{patch_version[1]}') 23 | return tuple(version_info) 24 | 25 | 26 | version_info = parse_version_info(__version__) 27 | -------------------------------------------------------------------------------- /seg/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, 3 | LoggerHook, ParamSchedulerHook) 4 | from mmengine.runner import LogProcessor 5 | from mmengine.visualization import LocalVisBackend 6 | 7 | from mmdet.engine.hooks import DetVisualizationHook 8 | from mmdet.visualization import DetLocalVisualizer 9 | 10 | default_scope = None 11 | 12 | default_hooks = dict( 13 | timer=dict(type=IterTimerHook), 14 | logger=dict(type=LoggerHook, interval=50), 15 | param_scheduler=dict(type=ParamSchedulerHook), 16 | checkpoint=dict(type=CheckpointHook, interval=1, max_keep_ckpts=1), 17 | sampler_seed=dict(type=DistSamplerSeedHook), 18 | visualization=dict(type=DetVisualizationHook)) 19 | 20 | env_cfg = dict( 21 | cudnn_benchmark=False, 22 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 23 | dist_cfg=dict(backend='nccl'), 24 | ) 25 | 26 | vis_backends = [dict(type=LocalVisBackend)] 27 | visualizer = dict( 28 | type=DetLocalVisualizer, vis_backends=vis_backends, name='visualizer') 29 | log_processor = dict(type=LogProcessor, window_size=50, by_epoch=True) 30 | 31 | log_level = 'INFO' 32 | load_from = None 33 | resume = False 34 | -------------------------------------------------------------------------------- /seg/configs/_base_/schedules/schedule_12e.py: -------------------------------------------------------------------------------- 1 | from mmengine.optim import LinearLR, MultiStepLR, OptimWrapper 2 | from mmengine.runner import EpochBasedTrainLoop, ValLoop, TestLoop 3 | from torch.optim import AdamW 4 | 5 | # training schedule for 50e 6 | train_cfg = dict( 7 | type=EpochBasedTrainLoop, 8 | max_epochs=12, 9 | val_interval=2, 10 | ) 11 | val_cfg = dict(type=ValLoop) 12 | test_cfg = dict(type=TestLoop) 13 | 14 | # learning rate 15 | param_scheduler = [ 16 | dict( 17 | type=LinearLR, 18 | start_factor=0.001, 19 | by_epoch=False, 20 | begin=0, 21 | end=500 22 | ), 23 | dict( 24 | type=MultiStepLR, 25 | begin=0, 26 | end=12, 27 | by_epoch=True, 28 | milestones=[8, 11], 29 | gamma=0.1 30 | ) 31 | ] 32 | 33 | _embed_multi = dict(lr_mult=1.0, decay_mult=0.0) 34 | optim_wrapper = dict( 35 | type=OptimWrapper, 36 | optimizer=dict( 37 | type=AdamW, 38 | lr=0.0001, 39 | weight_decay=0.05, 40 | eps=1e-8, 41 | betas=(0.9, 0.999) 42 | ), 43 | paramwise_cfg=dict( 44 | custom_keys={ 45 | 'backbone': dict(lr_mult=0.1, decay_mult=1.0), 46 | 'query_embed': _embed_multi, 47 | 'query_feat': _embed_multi, 48 | 'level_embed': _embed_multi, 49 | }, 50 | norm_decay_mult=0.0 51 | ), 52 | clip_grad=dict(max_norm=0.01, norm_type=2) 53 | ) 54 | 55 | # Default setting for scaling LR automatically 56 | # - `enable` means enable scaling LR automatically 57 | # or not by default. 58 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
59 | auto_scale_lr = dict(enable=True, base_batch_size=16) 60 | -------------------------------------------------------------------------------- /seg/configs/_base_/schedules/schedule_24e.py: -------------------------------------------------------------------------------- 1 | from mmengine.optim import LinearLR, MultiStepLR, OptimWrapper 2 | from mmengine.runner import EpochBasedTrainLoop, ValLoop, TestLoop 3 | from torch.optim import AdamW 4 | 5 | # training schedule for 50e 6 | train_cfg = dict( 7 | type=EpochBasedTrainLoop, 8 | max_epochs=24, 9 | val_interval=2, 10 | ) 11 | val_cfg = dict(type=ValLoop) 12 | test_cfg = dict(type=TestLoop) 13 | 14 | # learning rate 15 | param_scheduler = [ 16 | dict( 17 | type=LinearLR, 18 | start_factor=0.001, 19 | by_epoch=False, 20 | begin=0, 21 | end=500 22 | ), 23 | dict( 24 | type=MultiStepLR, 25 | begin=0, 26 | end=24, 27 | by_epoch=True, 28 | milestones=[16, 22], 29 | gamma=0.1 30 | ) 31 | ] 32 | 33 | _embed_multi = dict(lr_mult=1.0, decay_mult=0.0) 34 | optim_wrapper = dict( 35 | type=OptimWrapper, 36 | optimizer=dict( 37 | type=AdamW, 38 | lr=0.0001, 39 | weight_decay=0.05, 40 | eps=1e-8, 41 | betas=(0.9, 0.999) 42 | ), 43 | paramwise_cfg=dict( 44 | custom_keys={ 45 | 'backbone': dict(lr_mult=0.1, decay_mult=1.0), 46 | 'query_embed': _embed_multi, 47 | 'query_feat': _embed_multi, 48 | 'level_embed': _embed_multi, 49 | }, 50 | norm_decay_mult=0.0 51 | ), 52 | clip_grad=dict(max_norm=0.01, norm_type=2) 53 | ) 54 | 55 | # Default setting for scaling LR automatically 56 | # - `enable` means enable scaling LR automatically 57 | # or not by default. 58 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 59 | auto_scale_lr = dict(enable=True, base_batch_size=16) 60 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/datasets/ade.py: -------------------------------------------------------------------------------- 1 | from mmdet.models import BatchFixedSizePad 2 | from mmengine import read_base 3 | 4 | from seg.models.data_preprocessor import VideoSegDataPreprocessor 5 | 6 | with read_base(): 7 | from ..._base_.default_runtime import * 8 | from ..._base_.schedules.schedule_12e import * 9 | from ..._base_.datasets.ade_panoptic_ov import train_dataloader, image_size 10 | from ..._base_.datasets.ade_panoptic import val_dataloader, val_evaluator, test_dataloader, test_evaluator 11 | from ..._base_.datasets.joint_dataset import train_dataloader as training_loader 12 | 13 | batch_augments = [ 14 | dict( 15 | type=BatchFixedSizePad, 16 | size=(image_size[1], image_size[0]), 17 | img_pad_value=0, 18 | pad_mask=True, 19 | mask_pad_value=0, 20 | pad_seg=True, 21 | seg_pad_value=255 22 | ) 23 | ] 24 | data_preprocessor = dict( 25 | type=VideoSegDataPreprocessor, 26 | mean=[123.675, 116.28, 103.53], 27 | std=[58.395, 57.12, 57.375], 28 | bgr_to_rgb=True, 29 | pad_size_divisor=32, 30 | pad_mask=True, 31 | mask_pad_value=0, 32 | pad_seg=True, 33 | seg_pad_value=255, 34 | batch_augments=batch_augments 35 | ) 36 | 37 | num_things_classes = 100 38 | num_stuff_classes = 50 39 | num_classes = num_things_classes + num_stuff_classes 40 | 41 | ov_datasets_name = 'ADEPanopticOVDataset' 42 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/datasets/cityscapes.py: -------------------------------------------------------------------------------- 1 | from mmengine.config import read_base 2 | 3 | from mmdet.models import BatchFixedSizePad 4 | 5 | from 
seg.models.data_preprocessor import VideoSegDataPreprocessor 6 | from seg.models.utils import NO_OBJ 7 | 8 | with read_base(): 9 | from ..._base_.default_runtime import * 10 | from ..._base_.datasets.cityscapes_panoptic import * 11 | from ..._base_.schedules.schedule_12e import * 12 | 13 | batch_augments = [ 14 | dict( 15 | type=BatchFixedSizePad, 16 | size=(image_size[1], image_size[0]), 17 | img_pad_value=0, 18 | pad_mask=True, 19 | mask_pad_value=0, 20 | pad_seg=True, 21 | seg_pad_value=255 22 | ) 23 | ] 24 | data_preprocessor = dict( 25 | type=VideoSegDataPreprocessor, 26 | mean=[123.675, 116.28, 103.53], 27 | std=[58.395, 57.12, 57.375], 28 | bgr_to_rgb=True, 29 | pad_size_divisor=32, 30 | pad_mask=True, 31 | mask_pad_value=0, 32 | pad_seg=True, 33 | seg_pad_value=NO_OBJ, 34 | batch_augments=batch_augments 35 | ) 36 | 37 | num_things_classes = 11 38 | num_stuff_classes = 8 39 | num_classes = num_things_classes + num_stuff_classes 40 | 41 | ov_datasets_name = 'CityscapesPanopticDataset' 42 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/datasets/coco.py: -------------------------------------------------------------------------------- 1 | from mmengine.config import read_base 2 | 3 | from mmdet.models import BatchFixedSizePad 4 | 5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor 6 | 7 | with read_base(): 8 | from ..._base_.default_runtime import * 9 | from ..._base_.datasets.coco_panoptic_lsj import * 10 | from ..._base_.schedules.schedule_12e import * 11 | 12 | batch_augments = [ 13 | dict( 14 | type=BatchFixedSizePad, 15 | size=(image_size[1], image_size[0]), 16 | img_pad_value=0, 17 | pad_mask=True, 18 | mask_pad_value=0, 19 | pad_seg=True, 20 | seg_pad_value=255 21 | ) 22 | ] 23 | data_preprocessor = dict( 24 | type=VideoSegDataPreprocessor, 25 | mean=[123.675, 116.28, 103.53], 26 | std=[58.395, 57.12, 57.375], 27 | bgr_to_rgb=True, 28 | pad_size_divisor=32, 29 | pad_mask=True, 30 | mask_pad_value=0, 31 | pad_seg=True, 32 | seg_pad_value=255, 33 | batch_augments=batch_augments 34 | ) 35 | 36 | num_things_classes = 80 37 | num_stuff_classes = 53 38 | num_classes = num_things_classes + num_stuff_classes 39 | 40 | ov_datasets_name = 'CocoPanopticOVDataset' 41 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/datasets/coco_pan_point.py: -------------------------------------------------------------------------------- 1 | from mmengine.config import read_base 2 | 3 | from seg.evaluation.metrics.ins_cls_iou_metric import InsClsIoUMetric 4 | from seg.models.data_preprocessor import OVSAMDataPreprocessor 5 | 6 | with read_base(): 7 | from ..._base_.default_runtime import * 8 | from ..._base_.datasets.coco_panoptic_lsj_sam import * 9 | from ..._base_.schedules.schedule_12e import * 10 | 11 | data_preprocessor = dict( 12 | type=OVSAMDataPreprocessor, 13 | mean=[123.675, 116.28, 103.53], 14 | std=[58.395, 57.12, 57.375], 15 | bgr_to_rgb=True, 16 | pad_size_divisor=32, 17 | pad_mask=True, 18 | mask_pad_value=0, 19 | pad_seg=True, 20 | seg_pad_value=255, 21 | batch_augments=None, 22 | use_point_pseudo_box=True 23 | ) 24 | 25 | num_things_classes = 80 26 | num_stuff_classes = 0 27 | num_classes = num_things_classes + num_stuff_classes 28 | 29 | ov_datasets_name = 'CocoPanopticOVDataset' 30 | 31 | val_evaluator = dict( 32 | type=InsClsIoUMetric, 33 | with_score=False, 34 | ) 35 | test_evaluator = val_evaluator 36 | 
-------------------------------------------------------------------------------- /seg/configs/m2ov_val/datasets/davis.py: -------------------------------------------------------------------------------- 1 | from mmengine.config import read_base 2 | 3 | from mmdet.models import BatchFixedSizePad 4 | 5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor 6 | from seg.models.utils import NO_OBJ 7 | 8 | with read_base(): 9 | from ..._base_.default_runtime import * 10 | from ..._base_.datasets.davis import * 11 | from ..._base_.schedules.schedule_12e import * 12 | 13 | batch_augments = [ 14 | dict( 15 | type=BatchFixedSizePad, 16 | size=(image_size[1], image_size[0]), 17 | img_pad_value=0, 18 | pad_mask=True, 19 | mask_pad_value=0, 20 | pad_seg=True, 21 | seg_pad_value=NO_OBJ 22 | ) 23 | ] 24 | data_preprocessor = dict( 25 | type=VideoSegDataPreprocessor, 26 | mean=[123.675, 116.28, 103.53], 27 | std=[58.395, 57.12, 57.375], 28 | bgr_to_rgb=True, 29 | pad_size_divisor=32, 30 | pad_mask=True, 31 | mask_pad_value=0, 32 | pad_seg=True, 33 | seg_pad_value=NO_OBJ, 34 | batch_augments=batch_augments 35 | ) 36 | 37 | num_things_classes = 80 38 | num_stuff_classes = 0 39 | num_classes = num_things_classes + num_stuff_classes 40 | 41 | ov_datasets_name = 'CocoOVDataset' 42 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/datasets/vipseg.py: -------------------------------------------------------------------------------- 1 | from mmengine.config import read_base 2 | 3 | from mmdet.models import BatchFixedSizePad 4 | 5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor 6 | 7 | with read_base(): 8 | from ..._base_.default_runtime import * 9 | from ..._base_.datasets.vipseg import * 10 | from ..._base_.schedules.schedule_12e import * 11 | 12 | batch_augments = [ 13 | dict( 14 | type=BatchFixedSizePad, 15 | size=(image_size[1], image_size[0]), 16 | img_pad_value=0, 17 | pad_mask=True, 18 | mask_pad_value=0, 19 | pad_seg=True, 20 | seg_pad_value=255 21 | ) 22 | ] 23 | data_preprocessor = dict( 24 | type=VideoSegDataPreprocessor, 25 | mean=[123.675, 116.28, 103.53], 26 | std=[58.395, 57.12, 57.375], 27 | bgr_to_rgb=True, 28 | pad_size_divisor=32, 29 | pad_mask=True, 30 | mask_pad_value=0, 31 | pad_seg=True, 32 | seg_pad_value=255, 33 | batch_augments=batch_augments 34 | ) 35 | 36 | num_things_classes = 58 37 | num_stuff_classes = 66 38 | num_classes = num_things_classes + num_stuff_classes 39 | 40 | ov_datasets_name = 'VIPSegDataset' 41 | default_hooks.update( 42 | logger=dict(type=LoggerHook, interval=1), 43 | ) 44 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/datasets/y19.py: -------------------------------------------------------------------------------- 1 | from mmengine.config import read_base 2 | 3 | from mmdet.models import BatchFixedSizePad 4 | 5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor 6 | 7 | with read_base(): 8 | from ..._base_.default_runtime import * 9 | from ..._base_.datasets.youtube_vis_2019 import * 10 | from ..._base_.schedules.schedule_12e import * 11 | 12 | batch_augments = [ 13 | dict( 14 | type=BatchFixedSizePad, 15 | size=(image_size[1], image_size[0]), 16 | img_pad_value=0, 17 | pad_mask=True, 18 | mask_pad_value=0, 19 | pad_seg=True, 20 | seg_pad_value=255 21 | ) 22 | ] 23 | data_preprocessor = dict( 24 | type=VideoSegDataPreprocessor, 25 | mean=[123.675, 116.28, 103.53], 26 | std=[58.395, 57.12, 57.375], 27 | 
bgr_to_rgb=True, 28 | pad_size_divisor=32, 29 | pad_mask=True, 30 | mask_pad_value=0, 31 | pad_seg=True, 32 | seg_pad_value=255, 33 | batch_augments=batch_augments 34 | ) 35 | 36 | num_things_classes = 40 37 | num_stuff_classes = 0 38 | num_classes = num_things_classes + num_stuff_classes 39 | 40 | ov_datasets_name = 'YouTubeVISDataset_2019' 41 | default_hooks.update( 42 | logger=dict(type=LoggerHook, interval=1), 43 | ) 44 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/datasets/y21.py: -------------------------------------------------------------------------------- 1 | from mmengine.config import read_base 2 | 3 | from mmdet.models import BatchFixedSizePad 4 | 5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor 6 | 7 | with read_base(): 8 | from ..._base_.default_runtime import * 9 | from ..._base_.datasets.youtube_vis_2021 import * 10 | from ..._base_.schedules.schedule_12e import * 11 | from ..._base_.datasets.joint_dataset import train_dataloader as training_loader 12 | 13 | 14 | batch_augments = [ 15 | dict( 16 | type=BatchFixedSizePad, 17 | size=(image_size[1], image_size[0]), 18 | img_pad_value=0, 19 | pad_mask=True, 20 | mask_pad_value=0, 21 | pad_seg=True, 22 | seg_pad_value=255 23 | ) 24 | ] 25 | data_preprocessor = dict( 26 | type=VideoSegDataPreprocessor, 27 | mean=[123.675, 116.28, 103.53], 28 | std=[58.395, 57.12, 57.375], 29 | bgr_to_rgb=True, 30 | pad_size_divisor=32, 31 | pad_mask=True, 32 | mask_pad_value=0, 33 | pad_seg=True, 34 | seg_pad_value=255, 35 | batch_augments=batch_augments 36 | ) 37 | 38 | num_things_classes = 40 39 | num_stuff_classes = 0 40 | num_classes = num_things_classes + num_stuff_classes 41 | 42 | ov_datasets_name = 'YouTubeVISDataset_2021' 43 | default_hooks.update( 44 | logger=dict(type=LoggerHook, interval=1), 45 | ) 46 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/eval_m2_convl_300q_ov_ade.py: -------------------------------------------------------------------------------- 1 | from mmengine import read_base 2 | 3 | with read_base(): 4 | from .datasets.ade import * 5 | from .models.m2_convl_300q import * 6 | 7 | model.update( 8 | data_preprocessor=data_preprocessor, 9 | panoptic_head=dict( 10 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}', 11 | num_things_classes=num_things_classes, 12 | num_stuff_classes=num_stuff_classes, 13 | ), 14 | panoptic_fusion_head=dict( 15 | num_things_classes=num_things_classes, 16 | num_stuff_classes=num_stuff_classes, 17 | ), 18 | test_cfg=dict( 19 | panoptic_on=True, 20 | semantic_on=False, 21 | instance_on=False, 22 | ), 23 | ) 24 | overlapping = dict( 25 | train=training_loader.dataset, 26 | test=test_dataloader.dataset 27 | ) 28 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/eval_m2_convl_300q_ov_cityscapes.py: -------------------------------------------------------------------------------- 1 | from mmengine import read_base 2 | 3 | with read_base(): 4 | from .datasets.cityscapes import * 5 | from .models.m2_convl_300q import * 6 | 7 | model.update( 8 | data_preprocessor=data_preprocessor, 9 | panoptic_head=dict( 10 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}', 11 | num_things_classes=num_things_classes, 12 | num_stuff_classes=num_stuff_classes, 13 | ), 14 | panoptic_fusion_head=dict( 15 | num_things_classes=num_things_classes, 16 | num_stuff_classes=num_stuff_classes, 17 | ), 18 | test_cfg=dict( 19 
| panoptic_on=True, 20 | semantic_on=False, 21 | instance_on=False, 22 | ), 23 | ) 24 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/eval_m2_convl_300q_ov_coco.py: -------------------------------------------------------------------------------- 1 | from mmengine import read_base 2 | 3 | with read_base(): 4 | from .datasets.coco import * 5 | from .models.m2_convl_300q import * 6 | 7 | model.update( 8 | data_preprocessor=data_preprocessor, 9 | panoptic_head=dict( 10 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}', 11 | num_things_classes=num_things_classes, 12 | num_stuff_classes=num_stuff_classes, 13 | ), 14 | panoptic_fusion_head=dict( 15 | num_things_classes=num_things_classes, 16 | num_stuff_classes=num_stuff_classes, 17 | ), 18 | test_cfg=dict( 19 | panoptic_on=True, 20 | semantic_on=False, 21 | instance_on=True, 22 | ), 23 | ) 24 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/eval_m2_convl_300q_ov_davis.py: -------------------------------------------------------------------------------- 1 | from mmengine import read_base 2 | 3 | from seg.models.detectors import Mask2formerVideoMinVIS 4 | 5 | with read_base(): 6 | from .datasets.davis import * 7 | from .models.m2_convl_300q import * 8 | 9 | model.update( 10 | data_preprocessor=data_preprocessor, 11 | type=Mask2formerVideoMinVIS, 12 | clip_size=5, 13 | clip_size_small=3, 14 | whole_clip_thr=0, 15 | small_clip_thr=15, 16 | overlap=0, 17 | panoptic_head=dict( 18 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}', 19 | num_things_classes=num_things_classes, 20 | num_stuff_classes=num_stuff_classes, 21 | ), 22 | panoptic_fusion_head=dict( 23 | num_things_classes=num_things_classes, 24 | num_stuff_classes=num_stuff_classes, 25 | ), 26 | test_cfg=dict( 27 | panoptic_on=False, 28 | semantic_on=False, 29 | instance_on=False, 30 | proposal_on=True, 31 | num_proposals=25, 32 | ), 33 | ) 34 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/eval_m2_convl_300q_ov_vipseg.py: -------------------------------------------------------------------------------- 1 | from mmengine import read_base 2 | 3 | from seg.models.detectors import Mask2formerVideoMinVIS 4 | 5 | with read_base(): 6 | from .datasets.vipseg import * 7 | from .models.m2_convl_300q import * 8 | 9 | model.update( 10 | data_preprocessor=data_preprocessor, 11 | type=Mask2formerVideoMinVIS, 12 | clip_size=2, 13 | clip_size_small=3, 14 | whole_clip_thr=0, 15 | small_clip_thr=15, 16 | overlap=0, 17 | panoptic_head=dict( 18 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}', 19 | num_things_classes=num_things_classes, 20 | num_stuff_classes=num_stuff_classes, 21 | ), 22 | panoptic_fusion_head=dict( 23 | num_things_classes=num_things_classes, 24 | num_stuff_classes=num_stuff_classes, 25 | ), 26 | test_cfg=dict( 27 | panoptic_on=True, 28 | semantic_on=False, 29 | instance_on=False, 30 | ), 31 | ) 32 | 33 | val_evaluator = dict( 34 | type=VIPSegMetric, 35 | metric=['VPQ@1', 'VPQ@2', 'VPQ@4', 'VPQ@6'], 36 | format_only=True, 37 | ) 38 | test_evaluator = val_evaluator 39 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/eval_m2_convl_300q_ov_y19.py: -------------------------------------------------------------------------------- 1 | from mmengine import read_base 2 | 3 | from seg.models.detectors import Mask2formerVideoMinVIS 4 | 5 | with read_base(): 6 | 
from .datasets.y19 import * 7 | from .models.m2_convl_300q import * 8 | 9 | model.update( 10 | data_preprocessor=data_preprocessor, 11 | type=Mask2formerVideoMinVIS, 12 | clip_size=5, 13 | clip_size_small=3, 14 | whole_clip_thr=0, 15 | small_clip_thr=15, 16 | overlap=0, 17 | panoptic_head=dict( 18 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}', 19 | num_things_classes=num_things_classes, 20 | num_stuff_classes=num_stuff_classes, 21 | ), 22 | panoptic_fusion_head=dict( 23 | num_things_classes=num_things_classes, 24 | num_stuff_classes=num_stuff_classes, 25 | ), 26 | test_cfg=dict( 27 | panoptic_on=False, 28 | semantic_on=False, 29 | instance_on=True, 30 | ), 31 | ) 32 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/eval_m2_convl_300q_ov_y21.py: -------------------------------------------------------------------------------- 1 | from mmengine import read_base 2 | 3 | from seg.models.detectors import Mask2formerVideoMinVIS 4 | 5 | with read_base(): 6 | from .datasets.y21 import * 7 | from .models.m2_convl_300q import * 8 | 9 | model.update( 10 | data_preprocessor=data_preprocessor, 11 | type=Mask2formerVideoMinVIS, 12 | clip_size=5, 13 | clip_size_small=3, 14 | whole_clip_thr=0, 15 | small_clip_thr=15, 16 | overlap=0, 17 | panoptic_head=dict( 18 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}', 19 | num_things_classes=num_things_classes, 20 | num_stuff_classes=num_stuff_classes, 21 | ), 22 | panoptic_fusion_head=dict( 23 | num_things_classes=num_things_classes, 24 | num_stuff_classes=num_stuff_classes, 25 | ), 26 | test_cfg=dict( 27 | panoptic_on=False, 28 | semantic_on=False, 29 | instance_on=True, 30 | ), 31 | ) 32 | -------------------------------------------------------------------------------- /seg/configs/m2ov_val/eval_m2_convl_ov_coco_pan_point.py: -------------------------------------------------------------------------------- 1 | from mmengine import read_base 2 | 3 | with read_base(): 4 | from .datasets.coco_pan_point import * 5 | from .models.m2_convl_300q import * 6 | 7 | model.update( 8 | data_preprocessor=data_preprocessor, 9 | inference_sam=True, 10 | panoptic_head=dict( 11 | enable_box_query=True, 12 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}', 13 | num_things_classes=num_things_classes, 14 | num_stuff_classes=num_stuff_classes, 15 | ), 16 | panoptic_fusion_head=dict( 17 | num_things_classes=num_things_classes, 18 | num_stuff_classes=num_stuff_classes, 19 | ), 20 | test_cfg=dict( 21 | panoptic_on=False, 22 | semantic_on=False, 23 | instance_on=True, 24 | ), 25 | ) 26 | -------------------------------------------------------------------------------- /seg/datasets/pipelines/frame_sampling.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Dict, List, Optional 3 | 4 | import numpy as np 5 | from mmdet.registry import TRANSFORMS 6 | from mmdet.datasets.transforms import BaseFrameSample 7 | 8 | 9 | @TRANSFORMS.register_module() 10 | class VideoClipSample(BaseFrameSample): 11 | def __init__(self, 12 | num_selected: int = 1, 13 | interval: int = 1, 14 | collect_video_keys: List[str] = ['video_id', 'video_length']): 15 | self.num_selected = num_selected 16 | self.interval = interval 17 | super().__init__(collect_video_keys=collect_video_keys) 18 | 19 | def transform(self, video_infos: dict) -> Optional[Dict[str, List]]: 20 | """Transform the video information. 
21 | 22 | Args: 23 | video_infos (dict): The whole video information. 24 | 25 | Returns: 26 | dict: The data information of the sampled frames. 27 | """ 28 | len_with_interval = self.num_selected + (self.num_selected - 1) * (self.interval - 1) 29 | len_video = video_infos['video_length'] 30 | if len_with_interval > len_video: 31 | return None 32 | 33 | first_frame_id = random.sample(range(len_video - len_with_interval + 1), 1)[0] 34 | 35 | sampled_frames_ids = first_frame_id + np.arange(self.num_selected) * self.interval 36 | results = self.prepare_data(video_infos, sampled_frames_ids) 37 | 38 | return results 39 | 40 | def __repr__(self) -> str: 41 | repr_str = self.__class__.__name__ 42 | repr_str += f'(num_selected={self.num_selected}, ' 43 | repr_str += f'interval={self.interval}, ' 44 | repr_str += f'collect_video_keys={self.collect_video_keys})' 45 | return repr_str 46 | -------------------------------------------------------------------------------- /seg/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .openclip_backbone import OpenCLIPBackbone 2 | from .openclip_backbone import OpenCLIPBackboneText 3 | -------------------------------------------------------------------------------- /seg/models/data_preprocessor/__init__.py: -------------------------------------------------------------------------------- 1 | from .vidseg_data_preprocessor import VideoSegDataPreprocessor 2 | from .ovsam_preprocessor import OVSAMDataPreprocessor, OVSAMVideoSegDataPreprocessor 3 | -------------------------------------------------------------------------------- /seg/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .mask2former_vid import Mask2formerVideo 2 | from .mask2former_vid_minvis import Mask2formerVideoMinVIS 3 | -------------------------------------------------------------------------------- /seg/models/fusion_head/__init__.py: -------------------------------------------------------------------------------- 1 | from .omgseg_fusionhead import OMGFusionHead 2 | -------------------------------------------------------------------------------- /seg/models/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .mask2former_vid import Mask2FormerVideoHead 2 | -------------------------------------------------------------------------------- /seg/models/task_modules/cost.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | import torch 4 | from mmdet.models.task_modules.assigners.match_cost import BaseMatchCost 5 | from mmengine.structures import InstanceData 6 | from torch import Tensor 7 | 8 | from mmdet.registry import TASK_UTILS 9 | 10 | 11 | @TASK_UTILS.register_module() 12 | class FlexibleClassificationCost(BaseMatchCost): 13 | def __init__(self, weight: Union[float, int] = 1) -> None: 14 | super().__init__(weight=weight) 15 | 16 | def __call__(self, 17 | pred_instances: InstanceData, 18 | gt_instances: InstanceData, 19 | img_meta: Optional[dict] = None, 20 | **kwargs) -> Tensor: 21 | """Compute match cost. 22 | 23 | Args: 24 | pred_instances (:obj:`InstanceData`): ``scores`` inside is 25 | predicted classification logits, of shape 26 | (num_queries, num_class). 27 | gt_instances (:obj:`InstanceData`): ``labels`` inside should have 28 | shape (num_gt, ). 29 | img_meta (Optional[dict]): Image meta information. Defaults to None.
30 | 31 | Returns: 32 | Tensor: Match Cost matrix of shape (num_preds, num_gts). 33 | """ 34 | _pred_scores = pred_instances.scores 35 | gt_labels = gt_instances.labels 36 | 37 | pred_scores = _pred_scores[..., :-1] 38 | iou_score = _pred_scores[..., -1:] 39 | 40 | pred_scores = pred_scores.softmax(-1) 41 | iou_score = iou_score.sigmoid() 42 | pred_scores = torch.cat([pred_scores, iou_score], dim=-1) 43 | cls_cost = -pred_scores[:, gt_labels] 44 | 45 | return cls_cost * self.weight 46 | -------------------------------------------------------------------------------- /seg/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .video_gt_preprocess import preprocess_video_panoptic_gt 2 | from .mask_pool import mask_pool 3 | from .pan_seg_transform import INSTANCE_OFFSET_HB, mmpan2hbpan, mmgt2hbpan 4 | from .class_overlapping import calculate_class_overlapping 5 | from .online_pq_utils import cal_pq, IoUObj, NO_OBJ_ID 6 | from .no_obj import NO_OBJ 7 | from .offline_video_metrics import vpq_eval, stq 8 | -------------------------------------------------------------------------------- /seg/models/utils/class_overlapping.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | 4 | def calculate_class_overlapping(classes1: List[str], classes2: List[str]) -> List[bool]: 5 | words1 = [word for item in classes1 for word in item.split(',')] 6 | results = [] 7 | for item in classes2: 8 | flag: bool = False 9 | for word in item.split(','): 10 | if word in words1: 11 | flag = True 12 | break 13 | results.append(flag) 14 | return results 15 | -------------------------------------------------------------------------------- /seg/models/utils/load_checkpoint.py: -------------------------------------------------------------------------------- 1 | from mmengine.runner.checkpoint import CheckpointLoader 2 | 3 | 4 | def load_checkpoint_with_prefix(filename, prefix=None, map_location='cpu', logger='current'): 5 | """Load partial pretrained model with specific prefix. 6 | 7 | Args: 8 | filename (str): Accept local filepath, URL, ``torchvision://xxx``, 9 | ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for 10 | details. 11 | prefix (str, optional): The prefix of the sub-module to keep. Defaults to None. 12 | map_location (str | None): Same as :func:`torch.load`. 13 | Defaults to 'cpu'. 14 | logger: The logger used while loading the checkpoint. Defaults to 'current'. 15 | 16 | Returns: 17 | dict or OrderedDict: The loaded (and optionally prefix-filtered) state dict. 18 | """ 19 | 20 | checkpoint = CheckpointLoader.load_checkpoint(filename, map_location=map_location, logger=logger) 21 | 22 | if 'state_dict' in checkpoint: 23 | state_dict = checkpoint['state_dict'] 24 | else: 25 | state_dict = checkpoint 26 | if not prefix: 27 | return state_dict 28 | if not prefix.endswith('.'): 29 | prefix += '.'
30 | prefix_len = len(prefix) 31 | 32 | state_dict = { 33 | k[prefix_len:]: v 34 | for k, v in state_dict.items() if k.startswith(prefix) 35 | } 36 | 37 | assert state_dict, f'{prefix} is not in the pretrained model' 38 | return state_dict 39 | -------------------------------------------------------------------------------- /seg/models/utils/mask_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | # https://github.com/NVlabs/ODISE/blob/e97b06c424c575fec9fc5368dd4b3e050d91abc4/odise/modeling/meta_arch/odise.py#L923 6 | 7 | def mask_pool(x, mask): 8 | """ 9 | Args: 10 | x: [B, C, H, W] 11 | mask: [B, Q, H, W] 12 | """ 13 | if not x.shape[-2:] == mask.shape[-2:]: 14 | # reshape mask to x 15 | mask = F.interpolate(mask, size=x.shape[-2:], mode='bilinear', align_corners=False) 16 | with torch.no_grad(): 17 | mask = mask.detach() 18 | mask = (mask > 0).to(mask.dtype) 19 | denorm = mask.sum(dim=(-1, -2), keepdim=True) + 1e-8 20 | 21 | mask_pooled_x = torch.einsum( 22 | "bchw,bqhw->bqc", 23 | x, 24 | mask / denorm, 25 | ) 26 | return mask_pooled_x 27 | 28 | -------------------------------------------------------------------------------- /seg/models/utils/no_obj.py: -------------------------------------------------------------------------------- 1 | NO_OBJ = 65535 2 | -------------------------------------------------------------------------------- /seg/models/utils/pan_seg_transform.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import torch 4 | import numpy as np 5 | from mmdet.evaluation import INSTANCE_OFFSET 6 | 7 | INSTANCE_OFFSET_HB = 10000 8 | 9 | 10 | def mmpan2hbpan(pred_pan_map, num_classes): 11 | pan_seg_map = - np.ones_like(pred_pan_map) 12 | for itm in np.unique(pred_pan_map): 13 | if itm >= INSTANCE_OFFSET: 14 | # cls labels (from segmentation maps) 15 | cls = itm % INSTANCE_OFFSET 16 | # id labels (from tracking maps) 17 | ins = itm // INSTANCE_OFFSET 18 | pan_seg_map[pred_pan_map == itm] = cls * INSTANCE_OFFSET_HB + ins 19 | elif itm == num_classes: 20 | pan_seg_map[pred_pan_map == itm] = num_classes * INSTANCE_OFFSET_HB 21 | else: 22 | pan_seg_map[pred_pan_map == itm] = itm * INSTANCE_OFFSET_HB 23 | assert -1 not in pan_seg_map 24 | return pan_seg_map 25 | 26 | 27 | def mmgt2hbpan(data_samples): 28 | pan_map = copy.deepcopy(data_samples.gt_sem_seg.sem_seg[0]) 29 | pan_map = pan_map * INSTANCE_OFFSET_HB 30 | gt_instances = data_samples.gt_instances 31 | for idx in range(len(gt_instances)): 32 | mask = torch.tensor(gt_instances.masks.masks[idx], dtype=torch.bool) 33 | instance_id = gt_instances.instances_ids[idx].item() 34 | pan_map[mask] = instance_id 35 | 36 | return pan_map 37 | -------------------------------------------------------------------------------- /tools/dist.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | FILE=$1 4 | CONFIG=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-$((28500 + $RANDOM % 2000))} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | 12 | if command -v torchrun &> /dev/null 13 | then 14 | echo "Using torchrun mode." 
15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \ 16 | torchrun \ 17 | --nnodes=${NNODES} \ 18 | --node_rank=${NODE_RANK} \ 19 | --master_addr=${MASTER_ADDR} \ 20 | --master_port=${PORT} \ 21 | --nproc_per_node=${GPUS} \ 22 | $(dirname "$0")/${FILE}.py ${CONFIG} --launcher pytorch ${@:4} 23 | else 24 | echo "Using launch mode." 25 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \ 26 | python -m torch.distributed.launch \ 27 | --nnodes=${NNODES} \ 28 | --node_rank=${NODE_RANK} \ 29 | --master_addr=${MASTER_ADDR} \ 30 | --master_port=${PORT} \ 31 | --nproc_per_node=${GPUS} \ 32 | $(dirname "$0")/${FILE}.py ${CONFIG} --launcher pytorch ${@:4} 33 | fi 34 | -------------------------------------------------------------------------------- /tools/slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | FILE=$1 6 | CONFIG=$2 7 | GPUS=${GPUS:-8} 8 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 9 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 10 | MASTER_PORT=${MASTER_PORT:-$((28500 + $RANDOM % 2000))} 11 | PARTITION=${PARTITION:-DUMMY} 12 | JOB_NAME=${JOB_NAME:-DUMMY} 13 | QUOTATYPE=${QUOTATYPE:-auto} 14 | SRUN_ARGS=${SRUN_ARGS:-""} 15 | PY_ARGS=${@:3} 16 | 17 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \ 18 | CUDA_HOME=$(dirname $(dirname $(which nvcc))) \ 19 | MASTER_PORT=$MASTER_PORT \ 20 | srun -p ${PARTITION} \ 21 | --job-name=${JOB_NAME} \ 22 | --gres=gpu:${GPUS_PER_NODE} \ 23 | --ntasks=${GPUS} \ 24 | --ntasks-per-node=${GPUS_PER_NODE} \ 25 | --cpus-per-task=${CPUS_PER_TASK} \ 26 | --kill-on-bad-exit=1 \ 27 | --quotatype=${QUOTATYPE} \ 28 | ${SRUN_ARGS} \ 29 | python -u tools/${FILE}.py ${CONFIG} --launcher="slurm" ${PY_ARGS} 30 | --------------------------------------------------------------------------------
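A minimal usage sketch for the two launcher scripts above: both take the name of a script under tools/ as their first argument and forward any extra arguments to it. The entry-point name tools/test.py and the Slurm partition below are assumptions; the config path and the argument-forwarding behaviour (${@:4} in dist.sh, ${@:3} in slurm.sh) come from the scripts themselves.

# Single node, 8 GPUs; anything after the GPU count is forwarded to tools/test.py.
bash tools/dist.sh test seg/configs/m2ov_val/eval_m2_convl_300q_ov_coco.py 8

# Same entry point submitted through Slurm; PARTITION, JOB_NAME, GPUS, GPUS_PER_NODE, etc.
# are read from the environment, and everything after the config is forwarded as PY_ARGS.
PARTITION=your_partition JOB_NAME=omg_eval GPUS=8 GPUS_PER_NODE=8 \
bash tools/slurm.sh test seg/configs/m2ov_val/eval_m2_convl_300q_ov_coco.py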