├── .gitattributes
├── .gitignore
├── DATASET.md
├── EMB.md
├── INSTALL.md
├── LICENSE
├── OMG_Seg_README.md
├── README.md
├── demo
├── README.md
├── configs
│ ├── m2_convl.py
│ ├── m2_convl_vid.py
│ └── names
│ │ └── th139_st101.py
├── image_demo.py
├── images
│ ├── 350_6L1vA-xJt-M
│ │ ├── 00002020.jpg
│ │ ├── 00002023.jpg
│ │ ├── 00002026.jpg
│ │ ├── 00002029.jpg
│ │ ├── 00002032.jpg
│ │ ├── 00002035.jpg
│ │ ├── 00002038.jpg
│ │ ├── 00002041.jpg
│ │ ├── 00002044.jpg
│ │ ├── 00002047.jpg
│ │ ├── 00002050.jpg
│ │ ├── 00002053.jpg
│ │ ├── 00002056.jpg
│ │ ├── 00002059.jpg
│ │ └── 00002062.jpg
│ └── sa_1002.jpg
└── video_demo.py
├── ext
├── cityscapes_scripts
│ ├── createPanopticImgs.py
│ └── helpers
│ │ ├── __init__.py
│ │ ├── annotation.py
│ │ ├── csHelpers.py
│ │ ├── labels.py
│ │ ├── labels_cityPersons.py
│ │ └── version.py
├── class_names
│ └── VIPSeg.py
├── davis2017
│ ├── __init__.py
│ ├── davis.py
│ ├── evaluation.py
│ ├── metrics.py
│ ├── results.py
│ └── utils.py
├── meta
│ └── sam_meta.py
├── open_clip
│ ├── __init__.py
│ ├── bpe_simple_vocab_16e6.txt.gz
│ ├── coca_model.py
│ ├── constants.py
│ ├── factory.py
│ ├── generation_utils.py
│ ├── hf_configs.py
│ ├── hf_model.py
│ ├── loss.py
│ ├── model.py
│ ├── model_configs
│ │ ├── EVA01-g-14-plus.json
│ │ ├── EVA01-g-14.json
│ │ ├── EVA02-B-16.json
│ │ ├── EVA02-E-14-plus.json
│ │ ├── EVA02-E-14.json
│ │ ├── EVA02-L-14-336.json
│ │ ├── EVA02-L-14.json
│ │ ├── RN101-quickgelu.json
│ │ ├── RN101.json
│ │ ├── RN50-quickgelu.json
│ │ ├── RN50.json
│ │ ├── RN50x16.json
│ │ ├── RN50x4.json
│ │ ├── RN50x64.json
│ │ ├── ViT-B-16-plus-240.json
│ │ ├── ViT-B-16-plus.json
│ │ ├── ViT-B-16.json
│ │ ├── ViT-B-32-plus-256.json
│ │ ├── ViT-B-32-quickgelu.json
│ │ ├── ViT-B-32.json
│ │ ├── ViT-H-14.json
│ │ ├── ViT-H-16.json
│ │ ├── ViT-L-14-280.json
│ │ ├── ViT-L-14-336.json
│ │ ├── ViT-L-14.json
│ │ ├── ViT-L-16-320.json
│ │ ├── ViT-L-16.json
│ │ ├── ViT-M-16-alt.json
│ │ ├── ViT-M-16.json
│ │ ├── ViT-M-32-alt.json
│ │ ├── ViT-M-32.json
│ │ ├── ViT-S-16-alt.json
│ │ ├── ViT-S-16.json
│ │ ├── ViT-S-32-alt.json
│ │ ├── ViT-S-32.json
│ │ ├── ViT-bigG-14.json
│ │ ├── ViT-e-14.json
│ │ ├── ViT-g-14.json
│ │ ├── coca_ViT-B-32.json
│ │ ├── coca_ViT-L-14.json
│ │ ├── coca_base.json
│ │ ├── coca_roberta-ViT-B-32.json
│ │ ├── convnext_base.json
│ │ ├── convnext_base_w.json
│ │ ├── convnext_base_w_320.json
│ │ ├── convnext_large.json
│ │ ├── convnext_large_d.json
│ │ ├── convnext_large_d_320.json
│ │ ├── convnext_small.json
│ │ ├── convnext_tiny.json
│ │ ├── convnext_xlarge.json
│ │ ├── convnext_xxlarge.json
│ │ ├── convnext_xxlarge_320.json
│ │ ├── mt5-base-ViT-B-32.json
│ │ ├── mt5-xl-ViT-H-14.json
│ │ ├── roberta-ViT-B-32.json
│ │ ├── swin_base_patch4_window7_224.json
│ │ ├── vit_medium_patch16_gap_256.json
│ │ ├── vit_relpos_medium_patch16_cls_224.json
│ │ ├── xlm-roberta-base-ViT-B-32.json
│ │ └── xlm-roberta-large-ViT-H-14.json
│ ├── modified_resnet.py
│ ├── openai.py
│ ├── pretrained.py
│ ├── push_to_hf_hub.py
│ ├── timm_model.py
│ ├── tokenizer.py
│ ├── transform.py
│ ├── transformer.py
│ ├── utils.py
│ ├── version.py
│ ├── zero_shot_classifier.py
│ └── zero_shot_metadata.py
├── sam
│ ├── __init__.py
│ ├── common.py
│ ├── image_encoder.py
│ ├── mask_decoder.py
│ ├── prompt_encoder.py
│ └── transformer.py
└── templates
│ ├── __init__.py
│ └── vild.py
├── figs
├── method_comparison.jpg
└── omg_teaser.jpg
├── omg_llava
├── .owners.yml
├── .pre-commit-config-zh-cn.yaml
├── .pre-commit-config.yaml
├── INSTALL.md
├── MANIFEST.in
├── README.md
├── figs
│ └── omg_llava.png
├── omg_llava
│ ├── __init__.py
│ ├── configs
│ │ ├── __init__.py
│ │ ├── finetune
│ │ │ ├── omg_llava_7b_finetune_8gpus.py
│ │ │ └── specific_tasks_finetune
│ │ │ │ ├── finetune_gcg.py
│ │ │ │ └── finetune_refseg.py
│ │ └── pretrain
│ │ │ └── omg_llava_7b_pretrain_8gpus.py
│ ├── dataset
│ │ ├── CombineDataset.py
│ │ ├── DecoupledGCGDataset.py
│ │ ├── GCGDataset.py
│ │ ├── LlavaDataset.py
│ │ ├── MDPVPointsDataset.py
│ │ ├── ReferringSegDataset.py
│ │ ├── RegionCaptionDataset.py
│ │ ├── SemanticSegDataset.py
│ │ ├── __init__.py
│ │ ├── collect_fns
│ │ │ ├── __init__.py
│ │ │ └── omg_llava_collate_fn.py
│ │ ├── process_functions
│ │ │ ├── __init__.py
│ │ │ ├── decoupled_gcg_process.py
│ │ │ ├── gcg_process.py
│ │ │ ├── mdpv_points_process.py
│ │ │ ├── referring_seg_process.py
│ │ │ ├── region_caption_process.py
│ │ │ └── semantic_seg_process.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── ade20k_classes.json
│ │ │ ├── cocostuff_classes.txt
│ │ │ ├── grefer.py
│ │ │ ├── refcoco_refer.py
│ │ │ └── utils.py
│ ├── engine
│ │ ├── __init__.py
│ │ ├── dataset_info_hook.py
│ │ └── evaluate_chat_hook.py
│ ├── model
│ │ ├── __init__.py
│ │ ├── convnext_clip
│ │ │ ├── __init__.py
│ │ │ ├── open_clip
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz
│ │ │ │ ├── coca_model.py
│ │ │ │ ├── constants.py
│ │ │ │ ├── factory.py
│ │ │ │ ├── generation_utils.py
│ │ │ │ ├── hf_configs.py
│ │ │ │ ├── hf_model.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── model.py
│ │ │ │ ├── model_configs
│ │ │ │ │ ├── EVA01-g-14-plus.json
│ │ │ │ │ ├── EVA01-g-14.json
│ │ │ │ │ ├── EVA02-B-16.json
│ │ │ │ │ ├── EVA02-E-14-plus.json
│ │ │ │ │ ├── EVA02-E-14.json
│ │ │ │ │ ├── EVA02-L-14-336.json
│ │ │ │ │ ├── EVA02-L-14.json
│ │ │ │ │ ├── RN101-quickgelu.json
│ │ │ │ │ ├── RN101.json
│ │ │ │ │ ├── RN50-quickgelu.json
│ │ │ │ │ ├── RN50.json
│ │ │ │ │ ├── RN50x16.json
│ │ │ │ │ ├── RN50x4.json
│ │ │ │ │ ├── RN50x64.json
│ │ │ │ │ ├── ViT-B-16-plus-240.json
│ │ │ │ │ ├── ViT-B-16-plus.json
│ │ │ │ │ ├── ViT-B-16.json
│ │ │ │ │ ├── ViT-B-32-plus-256.json
│ │ │ │ │ ├── ViT-B-32-quickgelu.json
│ │ │ │ │ ├── ViT-B-32.json
│ │ │ │ │ ├── ViT-H-14.json
│ │ │ │ │ ├── ViT-H-16.json
│ │ │ │ │ ├── ViT-L-14-280.json
│ │ │ │ │ ├── ViT-L-14-336.json
│ │ │ │ │ ├── ViT-L-14.json
│ │ │ │ │ ├── ViT-L-16-320.json
│ │ │ │ │ ├── ViT-L-16.json
│ │ │ │ │ ├── ViT-M-16-alt.json
│ │ │ │ │ ├── ViT-M-16.json
│ │ │ │ │ ├── ViT-M-32-alt.json
│ │ │ │ │ ├── ViT-M-32.json
│ │ │ │ │ ├── ViT-S-16-alt.json
│ │ │ │ │ ├── ViT-S-16.json
│ │ │ │ │ ├── ViT-S-32-alt.json
│ │ │ │ │ ├── ViT-S-32.json
│ │ │ │ │ ├── ViT-bigG-14.json
│ │ │ │ │ ├── ViT-e-14.json
│ │ │ │ │ ├── ViT-g-14.json
│ │ │ │ │ ├── coca_ViT-B-32.json
│ │ │ │ │ ├── coca_ViT-L-14.json
│ │ │ │ │ ├── coca_base.json
│ │ │ │ │ ├── coca_roberta-ViT-B-32.json
│ │ │ │ │ ├── convnext_base.json
│ │ │ │ │ ├── convnext_base_w.json
│ │ │ │ │ ├── convnext_base_w_320.json
│ │ │ │ │ ├── convnext_large.json
│ │ │ │ │ ├── convnext_large_d.json
│ │ │ │ │ ├── convnext_large_d_320.json
│ │ │ │ │ ├── convnext_small.json
│ │ │ │ │ ├── convnext_tiny.json
│ │ │ │ │ ├── convnext_xlarge.json
│ │ │ │ │ ├── convnext_xxlarge.json
│ │ │ │ │ ├── convnext_xxlarge_320.json
│ │ │ │ │ ├── mt5-base-ViT-B-32.json
│ │ │ │ │ ├── mt5-xl-ViT-H-14.json
│ │ │ │ │ ├── roberta-ViT-B-32.json
│ │ │ │ │ ├── swin_base_patch4_window7_224.json
│ │ │ │ │ ├── vit_medium_patch16_gap_256.json
│ │ │ │ │ ├── vit_relpos_medium_patch16_cls_224.json
│ │ │ │ │ ├── xlm-roberta-base-ViT-B-32.json
│ │ │ │ │ └── xlm-roberta-large-ViT-H-14.json
│ │ │ │ ├── modified_resnet.py
│ │ │ │ ├── openai.py
│ │ │ │ ├── pretrained.py
│ │ │ │ ├── push_to_hf_hub.py
│ │ │ │ ├── timm_model.py
│ │ │ │ ├── tokenizer.py
│ │ │ │ ├── transform.py
│ │ │ │ ├── transformer.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── version.py
│ │ │ │ ├── zero_shot_classifier.py
│ │ │ │ └── zero_shot_metadata.py
│ │ │ └── openclip_backbone.py
│ │ ├── modules
│ │ │ ├── __init__.py
│ │ │ └── projector
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_projector.py
│ │ │ │ └── modeling_projector.py
│ │ ├── omg_llava.py
│ │ ├── omg_seg
│ │ │ ├── __init__.py
│ │ │ ├── mask2former_vid.py
│ │ │ ├── mask2former_vid_semanticsam.py
│ │ │ ├── omg_seg_visual_encoder.py
│ │ │ └── utils.py
│ │ └── utils.py
│ └── tools
│ │ ├── __init__.py
│ │ ├── app.py
│ │ ├── app_utils.py
│ │ ├── chat_omg_llava.py
│ │ ├── chat_omg_llava_msseg.py
│ │ ├── convert_deepspeed2pth.py
│ │ ├── evaluate_gcg.py
│ │ ├── evaluate_region_cap.py
│ │ ├── gcg_omg_seg_llava.py
│ │ ├── mmbench_omg_seg_llava.py
│ │ ├── refcoco_omg_seg_llava.py
│ │ ├── region_cap_mask_omg_seg_llava.py
│ │ └── utils_refcoco.py
├── requirements.txt
├── requirements
│ ├── deepspeed.txt
│ ├── docs.txt
│ ├── modelscope.txt
│ └── runtime.txt
├── setup.cfg
├── setup.py
├── test.jpg
└── xtuner
│ ├── __init__.py
│ ├── apis
│ ├── __init__.py
│ ├── datasets
│ │ ├── __init__.py
│ │ ├── alpaca.py
│ │ ├── arxiv.py
│ │ ├── code_alpaca.py
│ │ ├── colorist.py
│ │ ├── lawyer.py
│ │ ├── medical.py
│ │ ├── moss_003_sft.py
│ │ ├── oasst1.py
│ │ ├── open_orca.py
│ │ ├── sql.py
│ │ ├── tiny_codes.py
│ │ └── wizardlm.py
│ ├── model.py
│ └── training_args.py
│ ├── configs
│ ├── __init__.py
│ ├── baichuan
│ │ ├── baichuan2_13b_base
│ │ │ ├── baichuan2_13b_base_qlora_alpaca_e3.py
│ │ │ ├── baichuan2_13b_base_qlora_alpaca_enzh_e3.py
│ │ │ ├── baichuan2_13b_base_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── baichuan2_13b_base_qlora_alpaca_zh_e3.py
│ │ │ ├── baichuan2_13b_base_qlora_arxiv_gentitle_e3.py
│ │ │ ├── baichuan2_13b_base_qlora_code_alpaca_e3.py
│ │ │ ├── baichuan2_13b_base_qlora_colorist_e5.py
│ │ │ ├── baichuan2_13b_base_qlora_lawyer_e3.py
│ │ │ ├── baichuan2_13b_base_qlora_oasst1_512_e3.py
│ │ │ ├── baichuan2_13b_base_qlora_oasst1_e3.py
│ │ │ ├── baichuan2_13b_base_qlora_open_platypus_e3.py
│ │ │ └── baichuan2_13b_base_qlora_sql_e3.py
│ │ ├── baichuan2_13b_chat
│ │ │ ├── baichuan2_13b_chat_qlora_alpaca_e3.py
│ │ │ ├── baichuan2_13b_chat_qlora_alpaca_enzh_e3.py
│ │ │ ├── baichuan2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── baichuan2_13b_chat_qlora_alpaca_zh_e3.py
│ │ │ ├── baichuan2_13b_chat_qlora_code_alpaca_e3.py
│ │ │ ├── baichuan2_13b_chat_qlora_lawyer_e3.py
│ │ │ ├── baichuan2_13b_chat_qlora_oasst1_512_e3.py
│ │ │ ├── baichuan2_13b_chat_qlora_oasst1_e3.py
│ │ │ └── baichuan2_13b_chat_qlora_open_platypus_e3.py
│ │ ├── baichuan2_7b_base
│ │ │ ├── baichuan2_7b_base_qlora_alpaca_e3.py
│ │ │ ├── baichuan2_7b_base_qlora_alpaca_enzh_e3.py
│ │ │ ├── baichuan2_7b_base_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── baichuan2_7b_base_qlora_alpaca_zh_e3.py
│ │ │ ├── baichuan2_7b_base_qlora_arxiv_gentitle_e3.py
│ │ │ ├── baichuan2_7b_base_qlora_code_alpaca_e3.py
│ │ │ ├── baichuan2_7b_base_qlora_colorist_e5.py
│ │ │ ├── baichuan2_7b_base_qlora_lawyer_e3.py
│ │ │ ├── baichuan2_7b_base_qlora_oasst1_512_e3.py
│ │ │ ├── baichuan2_7b_base_qlora_oasst1_e3.py
│ │ │ ├── baichuan2_7b_base_qlora_open_platypus_e3.py
│ │ │ └── baichuan2_7b_base_qlora_sql_e3.py
│ │ ├── baichuan2_7b_chat
│ │ │ ├── baichuan2_7b_chat_qlora_alpaca_e3.py
│ │ │ ├── baichuan2_7b_chat_qlora_alpaca_enzh_e3.py
│ │ │ ├── baichuan2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── baichuan2_7b_chat_qlora_alpaca_zh_e3.py
│ │ │ ├── baichuan2_7b_chat_qlora_code_alpaca_e3.py
│ │ │ ├── baichuan2_7b_chat_qlora_lawyer_e3.py
│ │ │ ├── baichuan2_7b_chat_qlora_oasst1_512_e3.py
│ │ │ ├── baichuan2_7b_chat_qlora_oasst1_e3.py
│ │ │ └── baichuan2_7b_chat_qlora_open_platypus_e3.py
│ │ ├── baichuan_13b_base
│ │ │ ├── baichuan_13b_base_qlora_alpaca_e3.py
│ │ │ ├── baichuan_13b_base_qlora_alpaca_enzh_e3.py
│ │ │ ├── baichuan_13b_base_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── baichuan_13b_base_qlora_alpaca_zh_e3.py
│ │ │ ├── baichuan_13b_base_qlora_arxiv_gentitle_e3.py
│ │ │ ├── baichuan_13b_base_qlora_code_alpaca_e3.py
│ │ │ ├── baichuan_13b_base_qlora_colorist_e5.py
│ │ │ ├── baichuan_13b_base_qlora_lawyer_e3.py
│ │ │ ├── baichuan_13b_base_qlora_medical_e1.py
│ │ │ ├── baichuan_13b_base_qlora_moss_sft_all_e1.py
│ │ │ ├── baichuan_13b_base_qlora_moss_sft_all_e2_gpu8.py
│ │ │ ├── baichuan_13b_base_qlora_moss_sft_plugins_e1.py
│ │ │ ├── baichuan_13b_base_qlora_oasst1_512_e3.py
│ │ │ ├── baichuan_13b_base_qlora_oasst1_e3.py
│ │ │ ├── baichuan_13b_base_qlora_open_platypus_e3.py
│ │ │ ├── baichuan_13b_base_qlora_openorca_e1.py
│ │ │ ├── baichuan_13b_base_qlora_sql_e3.py
│ │ │ └── baichuan_13b_base_qlora_tiny_codes_e1.py
│ │ ├── baichuan_13b_chat
│ │ │ ├── baichuan_13b_chat_qlora_alpaca_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_alpaca_enzh_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_alpaca_zh_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_arxiv_gentitle_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_code_alpaca_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_colorist_e5.py
│ │ │ ├── baichuan_13b_chat_qlora_lawyer_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_medical_e1.py
│ │ │ ├── baichuan_13b_chat_qlora_oasst1_512_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_oasst1_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_open_platypus_e3.py
│ │ │ ├── baichuan_13b_chat_qlora_openorca_e1.py
│ │ │ ├── baichuan_13b_chat_qlora_sql_e3.py
│ │ │ └── baichuan_13b_chat_qlora_tiny_codes_e1.py
│ │ └── baichuan_7b
│ │ │ ├── baichuan_7b_qlora_alpaca_e3.py
│ │ │ ├── baichuan_7b_qlora_alpaca_enzh_e3.py
│ │ │ ├── baichuan_7b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── baichuan_7b_qlora_alpaca_zh_e3.py
│ │ │ ├── baichuan_7b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── baichuan_7b_qlora_code_alpaca_e3.py
│ │ │ ├── baichuan_7b_qlora_colorist_e5.py
│ │ │ ├── baichuan_7b_qlora_lawyer_e3.py
│ │ │ ├── baichuan_7b_qlora_medical_e1.py
│ │ │ ├── baichuan_7b_qlora_moss_sft_all_e1.py
│ │ │ ├── baichuan_7b_qlora_moss_sft_all_e2_gpu8.py
│ │ │ ├── baichuan_7b_qlora_moss_sft_plugins_e1.py
│ │ │ ├── baichuan_7b_qlora_oasst1_512_e3.py
│ │ │ ├── baichuan_7b_qlora_oasst1_e3.py
│ │ │ ├── baichuan_7b_qlora_open_platypus_e3.py
│ │ │ ├── baichuan_7b_qlora_openorca_e1.py
│ │ │ ├── baichuan_7b_qlora_sql_e3.py
│ │ │ └── baichuan_7b_qlora_tiny_codes_e1.py
│ ├── chatglm
│ │ ├── chatglm2_6b
│ │ │ ├── chatglm2_6b_qlora_alpaca_e3.py
│ │ │ ├── chatglm2_6b_qlora_alpaca_enzh_e3.py
│ │ │ ├── chatglm2_6b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── chatglm2_6b_qlora_alpaca_zh_e3.py
│ │ │ ├── chatglm2_6b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── chatglm2_6b_qlora_code_alpaca_e3.py
│ │ │ ├── chatglm2_6b_qlora_colorist_e5.py
│ │ │ ├── chatglm2_6b_qlora_lawyer_e3.py
│ │ │ ├── chatglm2_6b_qlora_medical_e1.py
│ │ │ ├── chatglm2_6b_qlora_oasst1_512_e3.py
│ │ │ ├── chatglm2_6b_qlora_oasst1_e3.py
│ │ │ ├── chatglm2_6b_qlora_open_platypus_e3.py
│ │ │ ├── chatglm2_6b_qlora_openorca_e1.py
│ │ │ ├── chatglm2_6b_qlora_sql_e3.py
│ │ │ └── chatglm2_6b_qlora_tiny_codes_e1.py
│ │ ├── chatglm3_6b
│ │ │ ├── chatglm3_6b_qlora_alpaca_e3.py
│ │ │ ├── chatglm3_6b_qlora_alpaca_enzh_e3.py
│ │ │ ├── chatglm3_6b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── chatglm3_6b_qlora_alpaca_zh_e3.py
│ │ │ ├── chatglm3_6b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── chatglm3_6b_qlora_code_alpaca_e3.py
│ │ │ ├── chatglm3_6b_qlora_colorist_e5.py
│ │ │ ├── chatglm3_6b_qlora_lawyer_e3.py
│ │ │ ├── chatglm3_6b_qlora_medical_e1.py
│ │ │ ├── chatglm3_6b_qlora_oasst1_512_e3.py
│ │ │ ├── chatglm3_6b_qlora_oasst1_e3.py
│ │ │ ├── chatglm3_6b_qlora_open_platypus_e3.py
│ │ │ ├── chatglm3_6b_qlora_openorca_e1.py
│ │ │ ├── chatglm3_6b_qlora_sql_e3.py
│ │ │ └── chatglm3_6b_qlora_tiny_codes_e1.py
│ │ └── chatglm3_6b_base
│ │ │ ├── chatglm3_6b_base_qlora_alpaca_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_alpaca_enzh_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_alpaca_zh_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_arxiv_gentitle_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_code_alpaca_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_colorist_e5.py
│ │ │ ├── chatglm3_6b_base_qlora_lawyer_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_medical_e1.py
│ │ │ ├── chatglm3_6b_base_qlora_oasst1_512_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_oasst1_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_open_platypus_e3.py
│ │ │ ├── chatglm3_6b_base_qlora_openorca_e1.py
│ │ │ ├── chatglm3_6b_base_qlora_sql_e3.py
│ │ │ └── chatglm3_6b_base_qlora_tiny_codes_e1.py
│ ├── cohere
│ │ ├── README.md
│ │ └── cohere_104b
│ │ │ └── cohere_100b_128k_sp32.py
│ ├── custom_dataset
│ │ ├── pretrain
│ │ │ ├── baichuan
│ │ │ │ ├── baichuan2_13b_base_full_custom_pretrain_e1.py
│ │ │ │ └── baichuan2_7b_base_full_custom_pretrain_e1.py
│ │ │ ├── chatglm
│ │ │ │ ├── chatglm2_6b_full_custom_pretrain_e1.py
│ │ │ │ └── chatglm3_6b_full_custom_pretrain_e1.py
│ │ │ ├── deepseek
│ │ │ │ └── deepseek_moe_16b_base_full_custom_pretrain_e1.py
│ │ │ ├── gemma
│ │ │ │ ├── gemma_2b_full_custom_pretrain_e1.py
│ │ │ │ └── gemma_7b_full_custom_pretrain_e1.py
│ │ │ ├── internlm
│ │ │ │ ├── internlm2_1_8b_full_custom_pretrain_e1.py
│ │ │ │ ├── internlm2_20b_full_custom_pretrain_e1.py
│ │ │ │ └── internlm2_7b_full_custom_pretrain_e1.py
│ │ │ ├── llama
│ │ │ │ ├── llama2_70b_full_custom_pretrain_e1.py
│ │ │ │ └── llama2_7b_full_custom_pretrain_e1.py
│ │ │ ├── mistral
│ │ │ │ └── mistral_7b_full_custom_pretrain_e1.py
│ │ │ ├── mixtral
│ │ │ │ └── mixtral_8x7b_full_custom_pretrain_e1.py
│ │ │ ├── qwen
│ │ │ │ ├── qwen1_5_0_5b_full_custom_pretrain_e1.py
│ │ │ │ ├── qwen1_5_14b_full_custom_pretrain_e1.py
│ │ │ │ ├── qwen1_5_1_8b_full_custom_pretrain_e1.py
│ │ │ │ ├── qwen1_5_4b_full_custom_pretrain_e1.py
│ │ │ │ ├── qwen1_5_72b_full_custom_pretrain_e1.py
│ │ │ │ ├── qwen1_5_7b_full_custom_pretrain_e1.py
│ │ │ │ ├── qwen_1_8b_full_custom_pretrain_e1.py
│ │ │ │ ├── qwen_72b_full_custom_pretrain_e1.py
│ │ │ │ └── qwen_7b_full_custom_pretrain_e1.py
│ │ │ ├── starcoder
│ │ │ │ └── starcoder_full_custom_pretrain_e1.py
│ │ │ ├── yi
│ │ │ │ ├── yi_34b_full_custom_pretrain_e1.py
│ │ │ │ └── yi_6b_full_custom_pretrain_e1.py
│ │ │ └── zephyr
│ │ │ │ └── zephyr_7b_beta_full_custom_pretrain_e1.py
│ │ └── sft
│ │ │ ├── baichuan
│ │ │ ├── baichuan2_13b_chat_qlora_custom_sft_e1.py
│ │ │ ├── baichuan2_7b_chat_qlora_custom_sft_e1.py
│ │ │ ├── baichuan_13b_chat_qlora_custom_sft_e1.py
│ │ │ └── baichuan_7b_qlora_custom_sft_e1.py
│ │ │ ├── chatglm
│ │ │ ├── chatglm2_6b_qlora_custom_sft_e1.py
│ │ │ └── chatglm3_6b_qlora_custom_sft_e1.py
│ │ │ ├── deepseek
│ │ │ ├── deepseek_moe_16b_chat_qlora_custom_sft_e1.py
│ │ │ └── deepseekcoder_6_7b_instruct_qlora_custom_sft_e1.py
│ │ │ ├── gemma
│ │ │ ├── gemma_2b_it_qlora_custom_sft_e1.py
│ │ │ ├── gemma_2b_qlora_custom_sft_e1.py
│ │ │ ├── gemma_7b_it_qlora_custom_sft_e1.py
│ │ │ └── gemma_7b_qlora_custom_sft_e1.py
│ │ │ ├── internlm
│ │ │ ├── internlm2_chat_1_8b_qlora_custom_sft_e1.py
│ │ │ ├── internlm2_chat_20b_qlora_custom_sft_e1.py
│ │ │ └── internlm2_chat_7b_qlora_custom_sft_e1.py
│ │ │ ├── llama
│ │ │ ├── llama2_70b_qlora_custom_sft_e1.py
│ │ │ └── llama2_7b_chat_qlora_custom_sft_e1.py
│ │ │ ├── mistral
│ │ │ └── mistral_7b_full_finetune_custom_sft_e1.py
│ │ │ ├── mixtral
│ │ │ └── mixtral_8x7b_instruct_qlora_custom_sft_e1.py
│ │ │ ├── qwen
│ │ │ ├── qwen1_5_0_5b_chat_qlora_custom_sft_e1.py
│ │ │ ├── qwen1_5_14b_chat_qlora_custom_sft_e1.py
│ │ │ ├── qwen1_5_1_8b_chat_qlora_custom_sft_e1.py
│ │ │ ├── qwen1_5_4b_chat_qlora_custom_sft_e1.py
│ │ │ ├── qwen1_5_72b_chat_qlora_custom_sft_e1.py
│ │ │ ├── qwen1_5_7b_chat_qlora_custom_sft_e1.py
│ │ │ ├── qwen_1_8b_chat_qlora_custom_sft_e1.py
│ │ │ ├── qwen_72b_qlora_custom_sft_e1.py
│ │ │ └── qwen_7b_chat_qlora_custom_sft_e1.py
│ │ │ ├── starcoder
│ │ │ └── starcoder_qlora_custom_sft_e1.py
│ │ │ ├── yi
│ │ │ ├── yi_34b_qlora_custom_sft_e1.py
│ │ │ └── yi_6b_qlora_custom_sft_e1.py
│ │ │ └── zephyr
│ │ │ └── zephyr_7b_beta_qlora_custom_sft_e1.py
│ ├── deepseek
│ │ ├── README.md
│ │ ├── deepseek_coder_6_7b_base
│ │ │ └── deepseek_coder_6_7b_base_qlora_code_alpaca_e3.py
│ │ ├── deepseek_coder_6_7b_instruct
│ │ │ └── deepseekcoder_6_7b_instruct_qlora_code_alpaca_e3.py
│ │ ├── deepseek_moe_16b_base
│ │ │ ├── deepseek_moe_16b_base_full_oasst1_e3.py
│ │ │ └── deepseek_moe_16b_base_qlora_oasst1_e3.py
│ │ ├── deepseek_moe_16b_chat
│ │ │ ├── deepseek_moe_16b_chat_full_oasst1_e3.py
│ │ │ └── deepseek_moe_16b_chat_qlora_oasst1_e3.py
│ │ ├── deepseek_v2_chat
│ │ │ └── deepseek_v2_chat_full_alpaca_e3.py
│ │ └── deepseek_v2_lite_chat
│ │ │ ├── deepseek_v2_lite_chat_full_alpaca_e3.py
│ │ │ └── deepseek_v2_lite_chat_full_alpaca_e3_32k_varlen.py
│ ├── deepspeed
│ │ ├── deepspeed_zero1.json
│ │ ├── deepspeed_zero2.json
│ │ ├── deepspeed_zero2_offload.json
│ │ ├── deepspeed_zero3.json
│ │ └── deepspeed_zero3_offload.json
│ ├── dpo
│ │ ├── internlm
│ │ │ ├── internlm2_chat_1_8b_dpo_full.py
│ │ │ ├── internlm2_chat_1_8b_dpo_full_varlenattn.py
│ │ │ ├── internlm2_chat_1_8b_dpo_full_varlenattn_jsonl_dataset.py
│ │ │ └── internlm2_chat_7b_dpo_qlora_varlenattn.py
│ │ └── llama
│ │ │ └── llama3_8b_instruct_dpo_qlora_varlenattn.py
│ ├── gemma
│ │ ├── gemma_2b
│ │ │ ├── gemma_2b_full_alpaca_e3.py
│ │ │ └── gemma_2b_qlora_alpaca_e3.py
│ │ ├── gemma_2b_it
│ │ │ ├── gemma_2b_it_full_alpaca_e3.py
│ │ │ └── gemma_2b_it_qlora_alpaca_e3.py
│ │ ├── gemma_7b
│ │ │ ├── gemma_7b_full_alpaca_e3.py
│ │ │ └── gemma_7b_qlora_alpaca_e3.py
│ │ └── gemma_7b_it
│ │ │ ├── gemma_7b_it_full_alpaca_e3.py
│ │ │ └── gemma_7b_it_qlora_alpaca_e3.py
│ ├── internlm
│ │ ├── internlm2_1_8b
│ │ │ ├── internlm2_1_8b_full_alpaca_e3.py
│ │ │ └── internlm2_1_8b_qlora_alpaca_e3.py
│ │ ├── internlm2_20b
│ │ │ ├── internlm2_20b_full_finetune_custom_dataset_e1.py
│ │ │ ├── internlm2_20b_qlora_alpaca_e3.py
│ │ │ ├── internlm2_20b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── internlm2_20b_qlora_code_alpaca_e3.py
│ │ │ ├── internlm2_20b_qlora_colorist_e5.py
│ │ │ ├── internlm2_20b_qlora_lawyer_e3.py
│ │ │ ├── internlm2_20b_qlora_msagent_react_e3_gpu8.py
│ │ │ ├── internlm2_20b_qlora_oasst1_512_e3.py
│ │ │ ├── internlm2_20b_qlora_oasst1_e3.py
│ │ │ └── internlm2_20b_qlora_sql_e3.py
│ │ ├── internlm2_7b
│ │ │ ├── internlm2_7b_full_finetune_custom_dataset_e1.py
│ │ │ ├── internlm2_7b_full_finetune_custom_dataset_e1_sequence_parallel_4.py
│ │ │ ├── internlm2_7b_qlora_alpaca_e3.py
│ │ │ ├── internlm2_7b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── internlm2_7b_qlora_code_alpaca_e3.py
│ │ │ ├── internlm2_7b_qlora_colorist_e5.py
│ │ │ ├── internlm2_7b_qlora_json_e3.py
│ │ │ ├── internlm2_7b_qlora_lawyer_e3.py
│ │ │ ├── internlm2_7b_qlora_msagent_react_e3_gpu8.py
│ │ │ ├── internlm2_7b_qlora_oasst1_512_e3.py
│ │ │ ├── internlm2_7b_qlora_oasst1_e3.py
│ │ │ ├── internlm2_7b_qlora_sql_e3.py
│ │ │ ├── internlm2_7b_w_internevo_dataset.py
│ │ │ ├── internlm2_7b_w_tokenized_dataset.py
│ │ │ └── internlm2_7b_w_untokenized_dataset.py
│ │ ├── internlm2_chat_1_8b
│ │ │ ├── internlm2_chat_1_8b_full_alpaca_e3.py
│ │ │ └── internlm2_chat_1_8b_qlora_alpaca_e3.py
│ │ ├── internlm2_chat_20b
│ │ │ ├── internlm2_chat_20b_full_finetune_custom_dataset_e1.py
│ │ │ ├── internlm2_chat_20b_qlora_alpaca_e3.py
│ │ │ ├── internlm2_chat_20b_qlora_code_alpaca_e3.py
│ │ │ ├── internlm2_chat_20b_qlora_lawyer_e3.py
│ │ │ ├── internlm2_chat_20b_qlora_oasst1_512_e3.py
│ │ │ └── internlm2_chat_20b_qlora_oasst1_e3.py
│ │ ├── internlm2_chat_7b
│ │ │ ├── internlm2_chat_7b_full_finetune_custom_dataset_e1.py
│ │ │ ├── internlm2_chat_7b_qlora_alpaca_e3.py
│ │ │ ├── internlm2_chat_7b_qlora_code_alpaca_e3.py
│ │ │ ├── internlm2_chat_7b_qlora_lawyer_e3.py
│ │ │ ├── internlm2_chat_7b_qlora_oasst1_512_e3.py
│ │ │ └── internlm2_chat_7b_qlora_oasst1_e3.py
│ │ ├── internlm_20b
│ │ │ ├── internlm_20b_qlora_alpaca_e3.py
│ │ │ ├── internlm_20b_qlora_alpaca_enzh_e3.py
│ │ │ ├── internlm_20b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── internlm_20b_qlora_alpaca_zh_e3.py
│ │ │ ├── internlm_20b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── internlm_20b_qlora_code_alpaca_e3.py
│ │ │ ├── internlm_20b_qlora_colorist_e5.py
│ │ │ ├── internlm_20b_qlora_lawyer_e3.py
│ │ │ ├── internlm_20b_qlora_msagent_react_e3_gpu8.py
│ │ │ ├── internlm_20b_qlora_oasst1_512_e3.py
│ │ │ ├── internlm_20b_qlora_oasst1_e3.py
│ │ │ ├── internlm_20b_qlora_open_platypus_e3.py
│ │ │ └── internlm_20b_qlora_sql_e3.py
│ │ ├── internlm_7b
│ │ │ ├── internlm_7b_full_alpaca_e3.py
│ │ │ ├── internlm_7b_full_alpaca_enzh_e3.py
│ │ │ ├── internlm_7b_full_alpaca_enzh_oasst1_e3.py
│ │ │ ├── internlm_7b_full_alpaca_zh_e3.py
│ │ │ ├── internlm_7b_full_intern_repo_dataset_template.py
│ │ │ ├── internlm_7b_full_oasst1_e3.py
│ │ │ ├── internlm_7b_qlora_alpaca_e3.py
│ │ │ ├── internlm_7b_qlora_alpaca_enzh_e3.py
│ │ │ ├── internlm_7b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── internlm_7b_qlora_alpaca_zh_e3.py
│ │ │ ├── internlm_7b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── internlm_7b_qlora_code_alpaca_e3.py
│ │ │ ├── internlm_7b_qlora_colorist_e5.py
│ │ │ ├── internlm_7b_qlora_json_e3.py
│ │ │ ├── internlm_7b_qlora_lawyer_e3.py
│ │ │ ├── internlm_7b_qlora_medical_e1.py
│ │ │ ├── internlm_7b_qlora_moss_sft_all_e1.py
│ │ │ ├── internlm_7b_qlora_moss_sft_all_e2_gpu8.py
│ │ │ ├── internlm_7b_qlora_moss_sft_plugins_e1.py
│ │ │ ├── internlm_7b_qlora_msagent_react_e3_gpu8.py
│ │ │ ├── internlm_7b_qlora_oasst1_512_e3.py
│ │ │ ├── internlm_7b_qlora_oasst1_e3.py
│ │ │ ├── internlm_7b_qlora_oasst1_e3_hf.py
│ │ │ ├── internlm_7b_qlora_oasst1_mmlu_e3.py
│ │ │ ├── internlm_7b_qlora_open_platypus_e3.py
│ │ │ ├── internlm_7b_qlora_openorca_e1.py
│ │ │ ├── internlm_7b_qlora_sql_e3.py
│ │ │ └── internlm_7b_qlora_tiny_codes_e1.py
│ │ ├── internlm_chat_20b
│ │ │ ├── internlm_chat_20b_qlora_alpaca_e3.py
│ │ │ ├── internlm_chat_20b_qlora_alpaca_enzh_e3.py
│ │ │ ├── internlm_chat_20b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── internlm_chat_20b_qlora_alpaca_zh_e3.py
│ │ │ ├── internlm_chat_20b_qlora_code_alpaca_e3.py
│ │ │ ├── internlm_chat_20b_qlora_lawyer_e3.py
│ │ │ ├── internlm_chat_20b_qlora_oasst1_512_e3.py
│ │ │ ├── internlm_chat_20b_qlora_oasst1_e3.py
│ │ │ └── internlm_chat_20b_qlora_open_platypus_e3.py
│ │ └── internlm_chat_7b
│ │ │ ├── internlm_chat_7b_qlora_alpaca_e3.py
│ │ │ ├── internlm_chat_7b_qlora_alpaca_enzh_e3.py
│ │ │ ├── internlm_chat_7b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── internlm_chat_7b_qlora_alpaca_zh_e3.py
│ │ │ ├── internlm_chat_7b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── internlm_chat_7b_qlora_code_alpaca_e3.py
│ │ │ ├── internlm_chat_7b_qlora_colorist_e5.py
│ │ │ ├── internlm_chat_7b_qlora_lawyer_e3.py
│ │ │ ├── internlm_chat_7b_qlora_medical_e1.py
│ │ │ ├── internlm_chat_7b_qlora_oasst1_512_e3.py
│ │ │ ├── internlm_chat_7b_qlora_oasst1_e3.py
│ │ │ ├── internlm_chat_7b_qlora_open_platypus_e3.py
│ │ │ ├── internlm_chat_7b_qlora_openorca_e1.py
│ │ │ ├── internlm_chat_7b_qlora_sql_e3.py
│ │ │ └── internlm_chat_7b_qlora_tiny_codes_e1.py
│ ├── llama
│ │ ├── llama2_70b
│ │ │ ├── llama2_70b_full_wizardlm_e1.py
│ │ │ ├── llama2_70b_int8_lora_open_platypus_e1.py
│ │ │ ├── llama2_70b_int8_lora_open_platypus_e1_hf.py
│ │ │ ├── llama2_70b_qlora_open_platypus_e1.py
│ │ │ └── llama2_70b_qlora_open_platypus_e1_hf.py
│ │ ├── llama2_7b
│ │ │ ├── llama2_7b_full_pgbooks_400iters_sp1.py
│ │ │ ├── llama2_7b_full_pgbooks_400iters_sp4.py
│ │ │ ├── llama2_7b_full_wizardlm_e1.py
│ │ │ ├── llama2_7b_qlora_alpaca_e3.py
│ │ │ ├── llama2_7b_qlora_alpaca_enzh_e3.py
│ │ │ ├── llama2_7b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── llama2_7b_qlora_alpaca_zh_e3.py
│ │ │ ├── llama2_7b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── llama2_7b_qlora_code_alpaca_e3.py
│ │ │ ├── llama2_7b_qlora_colorist_e5.py
│ │ │ ├── llama2_7b_qlora_lawyer_e3.py
│ │ │ ├── llama2_7b_qlora_medical_e1.py
│ │ │ ├── llama2_7b_qlora_moss_sft_all_e1.py
│ │ │ ├── llama2_7b_qlora_moss_sft_all_e2_gpu8.py
│ │ │ ├── llama2_7b_qlora_moss_sft_plugins_e1.py
│ │ │ ├── llama2_7b_qlora_msagent_react_e3_gpu8.py
│ │ │ ├── llama2_7b_qlora_oasst1_512_e3.py
│ │ │ ├── llama2_7b_qlora_oasst1_e3.py
│ │ │ ├── llama2_7b_qlora_open_platypus_e3.py
│ │ │ ├── llama2_7b_qlora_openorca_e1.py
│ │ │ ├── llama2_7b_qlora_sql_e3.py
│ │ │ └── llama2_7b_qlora_tiny_codes_e1.py
│ │ ├── llama2_7b_chat
│ │ │ ├── llama2_7b_chat_qlora_alpaca_e3.py
│ │ │ ├── llama2_7b_chat_qlora_alpaca_enzh_e3.py
│ │ │ ├── llama2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── llama2_7b_chat_qlora_alpaca_zh_e3.py
│ │ │ ├── llama2_7b_chat_qlora_arxiv_gentitle_e3.py
│ │ │ ├── llama2_7b_chat_qlora_code_alpaca_e3.py
│ │ │ ├── llama2_7b_chat_qlora_colorist_e5.py
│ │ │ ├── llama2_7b_chat_qlora_lawyer_e3.py
│ │ │ ├── llama2_7b_chat_qlora_medical_e1.py
│ │ │ ├── llama2_7b_chat_qlora_oasst1_512_e3.py
│ │ │ ├── llama2_7b_chat_qlora_oasst1_e3.py
│ │ │ ├── llama2_7b_chat_qlora_open_platypus_e3.py
│ │ │ ├── llama2_7b_chat_qlora_openorca_e1.py
│ │ │ ├── llama2_7b_chat_qlora_sql_e3.py
│ │ │ └── llama2_7b_chat_qlora_tiny_codes_e1.py
│ │ ├── llama3_70b_instruct
│ │ │ └── llama3_70b_instruct_qlora_alpaca_e3_2k_gpu8.py
│ │ ├── llama3_8b
│ │ │ ├── README.md
│ │ │ └── llama3_8b_full_alpaca_e3.py
│ │ ├── llama3_8b_instruct
│ │ │ ├── llama3_8b_instruct_full_alpaca_e3.py
│ │ │ └── llama3_8b_instruct_qlora_alpaca_e3.py
│ │ └── llama_7b
│ │ │ ├── llama_7b_qlora_alpaca_e3.py
│ │ │ ├── llama_7b_qlora_alpaca_enzh_e3.py
│ │ │ ├── llama_7b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ ├── llama_7b_qlora_alpaca_zh_e3.py
│ │ │ ├── llama_7b_qlora_arxiv_gentitle_e3.py
│ │ │ ├── llama_7b_qlora_code_alpaca_e3.py
│ │ │ ├── llama_7b_qlora_colorist_e5.py
│ │ │ ├── llama_7b_qlora_lawyer_e3.py
│ │ │ ├── llama_7b_qlora_medical_e1.py
│ │ │ ├── llama_7b_qlora_moss_sft_all_e1.py
│ │ │ ├── llama_7b_qlora_moss_sft_all_e2_gpu8.py
│ │ │ ├── llama_7b_qlora_moss_sft_plugins_e1.py
│ │ │ ├── llama_7b_qlora_oasst1_512_e3.py
│ │ │ ├── llama_7b_qlora_oasst1_e3.py
│ │ │ ├── llama_7b_qlora_open_platypus_e3.py
│ │ │ ├── llama_7b_qlora_openorca_e1.py
│ │ │ ├── llama_7b_qlora_sql_e3.py
│ │ │ └── llama_7b_qlora_tiny_codes_e1.py
│ ├── llama_speed_benchmark
│ │ ├── llama2_70b
│ │ │ ├── llama2_70b_full_alpaca_enzh_128k_sp8.py
│ │ │ ├── llama2_70b_full_alpaca_enzh_256k_sp16.py
│ │ │ ├── llama2_70b_full_alpaca_enzh_32k_sp4.py
│ │ │ └── llama2_70b_full_alpaca_enzh_8k_sp1.py
│ │ ├── llama2_7b
│ │ │ ├── llama2_7b_full_alpaca_enzh_128k_sp8.py
│ │ │ ├── llama2_7b_full_alpaca_enzh_1M_sp16.py
│ │ │ ├── llama2_7b_full_alpaca_enzh_256k_sp8.py
│ │ │ ├── llama2_7b_full_alpaca_enzh_32k_sp1.py
│ │ │ └── llama2_7b_full_alpaca_enzh_8k_sp1.py
│ │ └── yi_34b
│ │ │ ├── yi_34b_200k_full_alpaca_enzh_128k_sp8.py
│ │ │ ├── yi_34b_200k_full_alpaca_enzh_256k_sp8.py
│ │ │ ├── yi_34b_200k_full_alpaca_enzh_32k_sp2.py
│ │ │ └── yi_34b_200k_full_alpaca_enzh_8k_sp1.py
│ ├── llava
│ │ ├── README.md
│ │ ├── README_zh-CN.md
│ │ ├── internlm2_chat_1_8b_clip_vit_large_p14_336
│ │ │ ├── finetune
│ │ │ │ └── llava_internlm2_chat_1_8b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py
│ │ │ └── pretrain
│ │ │ │ └── llava_internlm2_chat_1_8b_clip_vit_large_p14_336_e1_gpu8_pretrain.py
│ │ ├── internlm2_chat_20b_clip_vit_large_p14_336
│ │ │ ├── finetune
│ │ │ │ ├── llava_internlm2_chat_20b_clip_vit_large_p14_336_e1_gpu8_finetune.py
│ │ │ │ └── llava_internlm2_chat_20b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py
│ │ │ └── pretrain
│ │ │ │ └── llava_internlm2_chat_20b_clip_vit_large_p14_336_e1_gpu8_pretrain.py
│ │ ├── internlm2_chat_7b_clip_vit_large_p14_336
│ │ │ ├── finetune
│ │ │ │ ├── llava_internlm2_chat_7b_clip_vit_large_p14_336_e1_gpu8_finetune.py
│ │ │ │ └── llava_internlm2_chat_7b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py
│ │ │ └── pretrain
│ │ │ │ └── llava_internlm2_chat_7b_clip_vit_large_p14_336_e1_gpu8_pretrain.py
│ │ ├── internlm_chat_7b_clip_vit_large_p14_336
│ │ │ ├── finetune
│ │ │ │ └── llava_internlm_chat_7b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py
│ │ │ └── pretrain
│ │ │ │ └── llava_internlm_chat_7b_clip_vit_large_p14_336_e1_gpu8_pretrain.py
│ │ ├── llama3_70b_instruct_clip_vit_large_p14_336
│ │ │ └── pretrain
│ │ │ │ └── llava_llama3_70b_instruct_quant_clip_vit_large_p14_336_e1_gpu8_pretrain.py
│ │ ├── llama3_8b_instruct_clip_vit_large_p14_336
│ │ │ ├── README.md
│ │ │ ├── convert_xtuner_weights_to_hf.py
│ │ │ ├── convert_xtuner_weights_to_llava.py
│ │ │ ├── finetune
│ │ │ │ ├── llava_llama3_8b_instruct_full_clip_vit_large_p14_336_e1_gpu8_finetune.py
│ │ │ │ ├── llava_llama3_8b_instruct_full_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py
│ │ │ │ ├── llava_llama3_8b_instruct_full_clip_vit_large_p14_336_lora_e1_gpu8_internvl_finetune.py
│ │ │ │ └── llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_e1_gpu1_finetune.py
│ │ │ └── pretrain
│ │ │ │ ├── llava_llama3_8b_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py
│ │ │ │ ├── llava_llama3_8b_instruct_clip_vit_large_p14_336_e1_gpu8_sharegpt4v_pretrain.py
│ │ │ │ └── llava_llama3_8b_instruct_quant_clip_vit_large_p14_336_e1_gpu1_pretrain.py
│ │ ├── official
│ │ │ ├── llava_v15_13b
│ │ │ │ ├── llava_v15_13b_finetune.py
│ │ │ │ ├── llava_v15_13b_finetune_lora.py
│ │ │ │ └── llava_v15_13b_pretrain.py
│ │ │ └── llava_v15_7b
│ │ │ │ ├── llava_v15_7b_finetune.py
│ │ │ │ ├── llava_v15_7b_finetune_lora.py
│ │ │ │ └── llava_v15_7b_pretrain.py
│ │ ├── phi3_mini_4k_instruct_clip_vit_large_p14_336
│ │ │ ├── README.md
│ │ │ ├── convert_phi_to_llama.py
│ │ │ ├── convert_xtuner_weights_to_hf.py
│ │ │ ├── convert_xtuner_weights_to_llava.py
│ │ │ ├── finetune
│ │ │ │ ├── llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_e1_gpu8_finetune.py
│ │ │ │ └── llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_full_e2_gpu8_internvl_finetune.py
│ │ │ └── pretrain
│ │ │ │ ├── llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py
│ │ │ │ └── llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_sharegpt4v_pretrain.py
│ │ ├── vicuna_13b_v15_clip_vit_large_p14_336
│ │ │ ├── finetune
│ │ │ │ └── llava_vicuna_13b_v15_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py
│ │ │ └── pretrain
│ │ │ │ └── llava_vicuna_13b_v15_clip_vit_large_p14_336_e1_gpu8_pretrain.py
│ │ └── vicuna_7b_v15_clip_vit_large_p14_336
│ │ │ ├── finetune
│ │ │ ├── llava_vicuna_7b_v15_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune.py
│ │ │ └── llava_vicuna_7b_v15_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune_refcoco.py
│ │ │ └── pretrain
│ │ │ └── llava_vicuna_7b_v15_clip_vit_large_p14_336_e1_gpu8_pretrain.py
│ ├── mistral
│ │ ├── mistral_7b_full_finetune_custom_dataset_e1.py
│ │ ├── mistral_7b_qlora_skypile_pretrain_e1.py
│ │ ├── mistral_7b_w_tokenized_dataset.py
│ │ └── mistral_7b_w_untokenized_dataset.py
│ ├── mixtral
│ │ ├── README.md
│ │ ├── mixtral_8x7b
│ │ │ ├── mixtral_8x7b_full_oasst1_e3.py
│ │ │ └── mixtral_8x7b_qlora_oasst1_e3.py
│ │ └── mixtral_8x7b_instruct
│ │ │ ├── mixtral_8x7b_instruct_full_oasst1_e3.py
│ │ │ └── mixtral_8x7b_instruct_qlora_oasst1_e3.py
│ ├── orpo
│ │ ├── internlm
│ │ │ ├── internlm2_chat_1_8b_orpo_full.py
│ │ │ ├── internlm2_chat_1_8b_orpo_full_varlenattn.py
│ │ │ ├── internlm2_chat_1_8b_orpo_full_varlenattn_jsonl_dataset.py
│ │ │ └── internlm2_chat_7b_orpo_qlora_varlenattn_ultrafeedback_e5.py
│ │ └── llama
│ │ │ └── llama3_8b_instruct_orpo_qlora_varlenattn_ultrafeedback_e5.py
│ ├── phi
│ │ └── phi3
│ │ │ ├── phi3_mini_128k_instruct_full_alpaca_e3.py
│ │ │ ├── phi3_mini_128k_instruct_qlora_alpaca_e3.py
│ │ │ ├── phi3_mini_4k_instruct_full_alpaca_e3.py
│ │ │ └── phi3_mini_4k_instruct_qlora_alpaca_e3.py
│ ├── qwen
│ │ ├── qwen1
│ │ │ ├── qwen_1_8b
│ │ │ │ ├── qwen_1_8b_qlora_alpaca_e3.py
│ │ │ │ ├── qwen_1_8b_qlora_alpaca_enzh_e3.py
│ │ │ │ ├── qwen_1_8b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ │ ├── qwen_1_8b_qlora_alpaca_zh_e3.py
│ │ │ │ └── qwen_1_8b_qlora_code_alpaca_e3.py
│ │ │ ├── qwen_1_8b_chat
│ │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_e3.py
│ │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_enzh_e3.py
│ │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ │ ├── qwen_1_8b_chat_qlora_alpaca_zh_e3.py
│ │ │ │ └── qwen_1_8b_chat_qlora_code_alpaca_e3.py
│ │ │ ├── qwen_72b
│ │ │ │ ├── qwen_72b_qlora_alpaca_e3.py
│ │ │ │ ├── qwen_72b_qlora_alpaca_enzh_e3.py
│ │ │ │ ├── qwen_72b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ │ ├── qwen_72b_qlora_alpaca_zh_e3.py
│ │ │ │ └── qwen_72b_qlora_code_alpaca_e3.py
│ │ │ ├── qwen_7b
│ │ │ │ ├── qwen_7b_qlora_alpaca_e3.py
│ │ │ │ ├── qwen_7b_qlora_alpaca_enzh_e3.py
│ │ │ │ ├── qwen_7b_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ │ ├── qwen_7b_qlora_alpaca_zh_e3.py
│ │ │ │ ├── qwen_7b_qlora_arxiv_gentitle_e3.py
│ │ │ │ ├── qwen_7b_qlora_code_alpaca_e3.py
│ │ │ │ ├── qwen_7b_qlora_colorist_e5.py
│ │ │ │ ├── qwen_7b_qlora_lawyer_e3.py
│ │ │ │ ├── qwen_7b_qlora_medical_e1.py
│ │ │ │ ├── qwen_7b_qlora_moss_sft_all_e1.py
│ │ │ │ ├── qwen_7b_qlora_moss_sft_all_e2_gpu8.py
│ │ │ │ ├── qwen_7b_qlora_moss_sft_plugins_e1.py
│ │ │ │ ├── qwen_7b_qlora_oasst1_512_e3.py
│ │ │ │ ├── qwen_7b_qlora_oasst1_e3.py
│ │ │ │ ├── qwen_7b_qlora_open_platypus_e3.py
│ │ │ │ ├── qwen_7b_qlora_openorca_e1.py
│ │ │ │ ├── qwen_7b_qlora_sql_e3.py
│ │ │ │ └── qwen_7b_qlora_tiny_codes_e1.py
│ │ │ └── qwen_7b_chat
│ │ │ │ ├── qwen_7b_chat_qlora_alpaca_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_alpaca_enzh_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_alpaca_enzh_oasst1_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_alpaca_zh_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_arxiv_gentitle_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_code_alpaca_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_colorist_e5.py
│ │ │ │ ├── qwen_7b_chat_qlora_lawyer_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_medical_e1.py
│ │ │ │ ├── qwen_7b_chat_qlora_oasst1_512_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_oasst1_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_open_platypus_e3.py
│ │ │ │ ├── qwen_7b_chat_qlora_openorca_e1.py
│ │ │ │ ├── qwen_7b_chat_qlora_sql_e3.py
│ │ │ │ └── qwen_7b_chat_qlora_tiny_codes_e1.py
│ │ └── qwen1_5
│ │ │ ├── qwen1_5_0_5b
│ │ │ ├── qwen1_5_0_5b_full_alpaca_e3.py
│ │ │ └── qwen1_5_0_5b_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_0_5b_chat
│ │ │ ├── qwen1_5_0_5b_chat_full_alpaca_e3.py
│ │ │ └── qwen1_5_0_5b_chat_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_110b
│ │ │ ├── qwen1_5_110b_full_alpaca_e3.py
│ │ │ └── qwen1_5_110b_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_110b_chat
│ │ │ ├── README.md
│ │ │ ├── qwen1_5_110b_chat_full_alpaca_e3.py
│ │ │ ├── qwen1_5_110b_chat_qlora_alpaca_e3.py
│ │ │ └── qwen1_5_110b_chat_qlora_alpaca_e3_16k_2gpus.py
│ │ │ ├── qwen1_5_14b
│ │ │ ├── qwen1_5_14b_full_alpaca_e3.py
│ │ │ └── qwen1_5_14b_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_14b_chat
│ │ │ ├── qwen1_5_14b_chat_full_alpaca_e3.py
│ │ │ └── qwen1_5_14b_chat_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_1_8b
│ │ │ ├── qwen1_5_1_8b_full_alpaca_e3.py
│ │ │ └── qwen1_5_1_8b_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_1_8b_chat
│ │ │ ├── qwen1_5_1_8b_chat_full_alpaca_e3.py
│ │ │ └── qwen1_5_1_8b_chat_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_4b
│ │ │ ├── qwen1_5_4b_full_alpaca_e3.py
│ │ │ └── qwen1_5_4b_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_4b_chat
│ │ │ ├── qwen1_5_4b_chat_full_alpaca_e3.py
│ │ │ └── qwen1_5_4b_chat_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_72b
│ │ │ ├── qwen1_5_72b_full_alpaca_e3.py
│ │ │ └── qwen1_5_72b_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_72b_chat
│ │ │ ├── qwen1_5_72b_chat_full_alpaca_e3.py
│ │ │ └── qwen1_5_72b_chat_qlora_alpaca_e3.py
│ │ │ ├── qwen1_5_7b
│ │ │ ├── qwen1_5_7b_full_alpaca_e3.py
│ │ │ └── qwen1_5_7b_qlora_alpaca_e3.py
│ │ │ └── qwen1_5_7b_chat
│ │ │ ├── qwen1_5_7b_chat_full_alpaca_e3.py
│ │ │ └── qwen1_5_7b_chat_qlora_alpaca_e3.py
│ ├── qwen_moe
│ │ └── qwen1_5
│ │ │ └── qwen1_5_moe_a2_7_b_chat
│ │ │ └── qwen1_5_moe_a2_7_b_chat_full_alpaca_e3.py
│ ├── reward_model
│ │ ├── internlm
│ │ │ ├── internlm2_chat_1_8b_reward_full_ultrafeedback.py
│ │ │ ├── internlm2_chat_1_8b_reward_full_varlenattn_jsonl_dataset.py
│ │ │ ├── internlm2_chat_1_8b_reward_full_varlenattn_ultrafeedback.py
│ │ │ └── internlm2_chat_1_8b_reward_qlora_varlenattn_ultrafeedback.py
│ │ └── llama
│ │ │ └── llama3_8b_instruct_reward_full_varlenattn_ultrafeedback.py
│ ├── starcoder
│ │ └── starcoder_qlora_stack_exchange_example.py
│ ├── yi
│ │ ├── yi_34b
│ │ │ └── yi_34b_qlora_alpaca_enzh_e3.py
│ │ └── yi_6b
│ │ │ └── yi_6b_qlora_alpaca_enzh_e3.py
│ └── zephyr
│ │ └── zephyr_7b_beta_qlora_alpaca_e3.py
│ ├── dataset
│ ├── __init__.py
│ ├── collate_fns
│ │ ├── __init__.py
│ │ ├── default_collate_fn.py
│ │ ├── mmlu_collate_fn.py
│ │ └── preference_collate_fn.py
│ ├── concat_dataset.py
│ ├── huggingface.py
│ ├── intern_repo.py
│ ├── json_dataset.py
│ ├── llava.py
│ ├── map_fns
│ │ ├── __init__.py
│ │ ├── dataset_map_fns
│ │ │ ├── __init__.py
│ │ │ ├── alpaca_map_fn.py
│ │ │ ├── alpaca_zh_map_fn.py
│ │ │ ├── arxiv_map_fn.py
│ │ │ ├── code_alpaca_map_fn.py
│ │ │ ├── colors_map_fn.py
│ │ │ ├── crime_kg_assitant_map_fn.py
│ │ │ ├── default_map_fn.py
│ │ │ ├── law_reference_map_fn.py
│ │ │ ├── llava_map_fn.py
│ │ │ ├── medical_map_fn.py
│ │ │ ├── msagent_map_fn.py
│ │ │ ├── oasst1_map_fn.py
│ │ │ ├── openai_map_fn.py
│ │ │ ├── openorca_map_fn.py
│ │ │ ├── pretrain_map_fn.py
│ │ │ ├── sql_map_fn.py
│ │ │ ├── stack_exchange_map_fn.py
│ │ │ ├── tiny_codes_map_fn.py
│ │ │ └── wizardlm_map_fn.py
│ │ └── template_map_fn.py
│ ├── modelscope.py
│ ├── moss_sft.py
│ ├── preference_dataset.py
│ ├── refcoco_json.py
│ ├── samplers
│ │ ├── __init__.py
│ │ ├── intern_repo.py
│ │ └── length_grouped.py
│ └── utils.py
│ ├── engine
│ ├── __init__.py
│ ├── _strategy
│ │ ├── __init__.py
│ │ └── deepspeed.py
│ ├── hooks
│ │ ├── __init__.py
│ │ ├── dataset_info_hook.py
│ │ ├── evaluate_chat_hook.py
│ │ ├── hf_checkpoint_hook.py
│ │ ├── throughput_hook.py
│ │ └── varlen_attn_args_to_messagehub_hook.py
│ └── runner
│ │ ├── __init__.py
│ │ └── loops.py
│ ├── entry_point.py
│ ├── evaluation
│ ├── __init__.py
│ └── metrics
│ │ ├── __init__.py
│ │ ├── mmlu_metric.py
│ │ └── reward_metric.py
│ ├── model
│ ├── __init__.py
│ ├── dpo.py
│ ├── llava.py
│ ├── modules
│ │ ├── __init__.py
│ │ ├── dispatch
│ │ │ ├── __init__.py
│ │ │ ├── attention.py
│ │ │ ├── baichuan.py
│ │ │ ├── cohere.py
│ │ │ ├── deepseek_v2.py
│ │ │ ├── internlm.py
│ │ │ ├── internlm2.py
│ │ │ ├── llama.py
│ │ │ ├── mistral.py
│ │ │ ├── phi3.py
│ │ │ ├── qwen2.py
│ │ │ ├── triton_kernels
│ │ │ │ ├── __init__.py
│ │ │ │ ├── layer_norm.py
│ │ │ │ ├── rms_norm.py
│ │ │ │ └── rotary.py
│ │ │ ├── utils.py
│ │ │ └── yi.py
│ │ └── projector
│ │ │ ├── __init__.py
│ │ │ ├── configuration_projector.py
│ │ │ └── modeling_projector.py
│ ├── orpo.py
│ ├── reward.py
│ ├── sft.py
│ ├── transformers_models
│ │ ├── __init__.py
│ │ ├── deepseek_v2
│ │ │ ├── __init__.py
│ │ │ ├── configuration_deepseek.py
│ │ │ ├── modeling_deepseek.py
│ │ │ └── tokenization_deepseek_fast.py
│ │ └── mixtral
│ │ │ ├── __init__.py
│ │ │ ├── configuration_mixtral.py
│ │ │ └── modeling_mixtral.py
│ └── utils.py
│ ├── parallel
│ ├── __init__.py
│ └── sequence
│ │ ├── __init__.py
│ │ ├── attention.py
│ │ ├── comm.py
│ │ ├── data_collate.py
│ │ ├── reduce_loss.py
│ │ ├── sampler.py
│ │ └── setup_distributed.py
│ ├── registry.py
│ ├── tools
│ ├── chat.py
│ ├── check_custom_dataset.py
│ ├── copy_cfg.py
│ ├── data_preprocess
│ │ ├── arxiv.py
│ │ └── convert_refcoco.py
│ ├── eval_refcoco.py
│ ├── get_data_order.py
│ ├── list_cfg.py
│ ├── list_dataset_format.py
│ ├── log_dataset.py
│ ├── mmbench.py
│ ├── model_converters
│ │ ├── merge.py
│ │ ├── modeling_internlm2_reward
│ │ │ ├── __init__.py
│ │ │ ├── configuration_internlm2.py
│ │ │ └── modeling_internlm2.py
│ │ ├── pth_to_hf.py
│ │ └── split.py
│ ├── plugins
│ │ ├── __init__.py
│ │ ├── api.py
│ │ ├── calculate.py
│ │ ├── search.py
│ │ └── solve.py
│ ├── process_untokenized_datasets.py
│ ├── process_untokenized_datasets_legacy.py
│ ├── process_untokenized_llava_data.py
│ ├── test.py
│ ├── tokenize_ftdp_datasets.py
│ ├── train.py
│ └── utils.py
│ ├── utils
│ ├── __init__.py
│ ├── constants.py
│ ├── fileio.py
│ ├── handle_moe_load_and_save.py
│ ├── stop_criteria.py
│ ├── templates.py
│ └── zero_to_any_dtype.py
│ └── version.py
├── seg
├── configs
│ ├── _base_
│ │ ├── datasets
│ │ │ ├── ade_panoptic.py
│ │ │ ├── ade_panoptic_ov.py
│ │ │ ├── ade_panoptic_ov_720p.py
│ │ │ ├── cityscapes_panoptic.py
│ │ │ ├── cityscapes_panoptic_720p.py
│ │ │ ├── coco_panoptic_lsj.py
│ │ │ ├── coco_panoptic_lsj_sam.py
│ │ │ ├── coco_panoptic_lsj_sam_720p.py
│ │ │ ├── coco_panoptic_video_ade_yt19_yt21_vip_cityscapes.py
│ │ │ ├── coco_panoptic_video_lsj.py
│ │ │ ├── coco_panoptic_video_yt19_vip_cityscapes_cocopansam.py
│ │ │ ├── coco_panoptic_video_yt19_vip_cocopansam.py
│ │ │ ├── coco_panoptic_video_yt19_yt21_vip_cityscapes_cocopansam.py
│ │ │ ├── davis.py
│ │ │ ├── joint_dataset.py
│ │ │ ├── objects365v2_detection_lsj.py
│ │ │ ├── objects365v2_instance_lsj.py
│ │ │ ├── vipseg.py
│ │ │ ├── youtube_vis_2019.py
│ │ │ ├── youtube_vis_2021.py
│ │ │ └── youtube_vis_ovis.py
│ │ ├── default_runtime.py
│ │ └── schedules
│ │ │ ├── schedule_12e.py
│ │ │ └── schedule_24e.py
│ ├── m2_train_close_set
│ │ └── omg_convl_coco_vid_ade_yt19_yt21_vip_city.py
│ ├── m2ov_train
│ │ ├── omg_convl_vlm_fix_12e_ov_coco_vid_yt19_vip_city_cocopansam.py
│ │ └── omg_convl_vlm_fix_12e_ov_coco_vid_yt19_y21_vip_city_cocopansam.py
│ └── m2ov_val
│ │ ├── datasets
│ │ ├── ade.py
│ │ ├── cityscapes.py
│ │ ├── coco.py
│ │ ├── coco_pan_point.py
│ │ ├── davis.py
│ │ ├── vipseg.py
│ │ ├── y19.py
│ │ └── y21.py
│ │ ├── eval_m2_convl_300q_ov_ade.py
│ │ ├── eval_m2_convl_300q_ov_cityscapes.py
│ │ ├── eval_m2_convl_300q_ov_coco.py
│ │ ├── eval_m2_convl_300q_ov_davis.py
│ │ ├── eval_m2_convl_300q_ov_vipseg.py
│ │ ├── eval_m2_convl_300q_ov_y19.py
│ │ ├── eval_m2_convl_300q_ov_y21.py
│ │ ├── eval_m2_convl_ov_coco_pan_point.py
│ │ └── models
│ │ └── m2_convl_300q.py
├── datasets
│ ├── ade_ov.py
│ ├── cityscapes.py
│ ├── coco_ins_ov.py
│ ├── coco_ov.py
│ ├── coco_pan_sam.py
│ ├── concat_dataset.py
│ ├── davis.py
│ ├── pipelines
│ │ ├── formatting.py
│ │ ├── frame_copy.py
│ │ ├── frame_sampling.py
│ │ ├── loading.py
│ │ └── transforms.py
│ ├── samplers
│ │ ├── batch_sampler.py
│ │ └── multi_dataset_sampler.py
│ ├── vipseg.py
│ └── youtube_vis_dataset.py
├── evaluation
│ ├── hooks
│ │ └── visual_hook.py
│ └── metrics
│ │ ├── cityscapes_panoptic_metric.py
│ │ ├── ins_cls_iou_metric.py
│ │ ├── vip_seg_metric.py
│ │ └── vos_metric.py
└── models
│ ├── backbones
│ ├── __init__.py
│ └── openclip_backbone.py
│ ├── data_preprocessor
│ ├── __init__.py
│ ├── ovsam_preprocessor.py
│ └── vidseg_data_preprocessor.py
│ ├── detectors
│ ├── __init__.py
│ ├── mask2former_vid.py
│ └── mask2former_vid_minvis.py
│ ├── fusion_head
│ ├── __init__.py
│ └── omgseg_fusionhead.py
│ ├── heads
│ ├── __init__.py
│ └── mask2former_vid.py
│ ├── task_modules
│ └── cost.py
│ └── utils
│ ├── __init__.py
│ ├── class_overlapping.py
│ ├── load_checkpoint.py
│ ├── mask_pool.py
│ ├── no_obj.py
│ ├── offline_video_metrics.py
│ ├── online_pq_utils.py
│ ├── pan_seg_transform.py
│ └── video_gt_preprocess.py
└── tools
├── dataset_convert
└── vis_to_coco.py
├── dist.sh
├── eval_scripts
├── eval_davis.py
└── eval_video.py
├── gen_cls.py
├── slurm.sh
├── test.py
└── train.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.jpg filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/EMB.md:
--------------------------------------------------------------------------------
1 | ## Generate text embeddings for each dataset and download the pretrained models
2 |
3 | ### For Separate Datasets (Mainly for Evaluation)
4 |
5 | We adopt per-dataset embeddings for testing.
6 |
7 |
8 | ```commandline
9 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_coco.py 1
10 | ```
11 | ```commandline
12 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_ade.py 1
13 | ```
14 |
15 | ```commandline
16 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_cityscapes.py 1
17 | ```
18 |
19 | ```commandline
20 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_vipseg.py 1
21 | ```
22 |
23 | ```commandline
24 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_y19.py 1
25 | ```
26 |
27 | ```commandline
28 | ./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_y21.py 1
29 | ```
30 |
31 | ### For Merged Dataset Training (Mainly for Co-Training)
32 |
33 | We adopt the merged-dataset embeddings for training.
34 |
35 | ```commandline
36 | ./tools/dist.sh gen_cls seg/configs/m2ov_train/omg_convl_vlm_fix_24e_ov_coco_vid_yt19_vip_city_cocopansam.py 1
37 | ```
38 |
39 | Once the generation finishes, you will find the embedding file in your cache folder.
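
If you want to sanity-check the result, here is a minimal sketch. It assumes the embeddings are stored as a PyTorch file; the path below is a placeholder, so substitute the file name printed by the generation script.

```python
import os

import torch

# Placeholder path: substitute the embedding file reported by the gen_cls run.
emb_path = os.path.expanduser('~/.cache/omg_seg/coco_class_embeddings.pth')
class_embeddings = torch.load(emb_path, map_location='cpu')

# If the file stores a single tensor, its first dimension should match the number of class names.
if torch.is_tensor(class_embeddings):
    print(class_embeddings.shape)
else:
    print(type(class_embeddings))
```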
40 |
41 | ### Download Pre-trained OpenCLIP Models
42 |
43 | When generating the class embedding classifier, the scripts will automatically download the pre-trained CLIP models.
44 |
45 | If you are in China, you can use [HF-Mirror](https://hf-mirror.com/). Follow the steps below to set the default endpoint.
46 |
47 | ```commandline
48 | pip install -U huggingface_hub
49 | ```
50 |
51 | ```commandline
52 | export HF_ENDPOINT=https://hf-mirror.com
53 | ```
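
Remember to export the variable in the same shell session in which you run the embedding-generation command, for example:

```commandline
export HF_ENDPOINT=https://hf-mirror.com
./tools/dist.sh gen_cls seg/configs/m2ov_val/eval_m2_convl_300q_ov_coco.py 1
```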
54 |
55 |
56 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This project is licensed under the MIT license.
2 | Copyrights are respective of each contributor listed at the beginning of each definition file.
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 |
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 |
8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/demo/README.md:
--------------------------------------------------------------------------------
1 | # OMG-Seg Demo
2 |
3 | We provide a single-file demo in this folder to help you get started. The commands below assume that you are in the root directory of this project.
4 |
5 | ## Embedding Generation
6 | To use the demo, you first need to generate a class name list that tells the OMG-Seg model all possible categories, i.e., the vocabulary dictionary. We have already provided a sample vocabulary list in `demo/configs/names/th139_st101.py`.
7 |
8 | Then, we need to generate the class embeddings from these names. You can do this with the following command:
9 | ```commandline
10 | PYTHONPATH=. python tools/gen_cls.py demo/configs/m2_convl.py
11 | ```
12 | The script will automatically read the class list, which is imported in `demo/configs/m2_convl.py` (please refer to `CLASSES` and `DATASET_NAME`), and generate the embeddings.
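
If you want to try your own vocabulary, a minimal sketch of a custom names file is shown below. It only assumes that a `CLASSES` tuple and a `DATASET_NAME` string are required; check `demo/configs/names/th139_st101.py` for the exact format that `demo/configs/m2_convl.py` expects.

```python
# Hypothetical file, e.g. demo/configs/names/my_names.py.
# Assumption: the config only needs DATASET_NAME and CLASSES;
# see demo/configs/names/th139_st101.py for the authoritative format.
DATASET_NAME = 'my_custom_vocab'

CLASSES = (
    'person',
    'car',
    'dog',
    'tree',
    'sky',
)
```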
13 |
14 | ## Run the Demo
15 | After generating the embeddings, you can run the demo by:
16 | ```commandline
17 | PYTHONPATH=. python demo/image_demo.py
18 | ```
19 | for images, and
20 | ```commandline
21 | PYTHONPATH=. python demo/video_demo.py
22 | ```
23 | for videos.
24 |
25 | Please refer to `test_image` and `test_video` for the visualized outputs.
26 |
27 | ## Customization
28 | If you want to try your own images or videos, please change the `IMG_PATH`, `VID_PATH`, and `MODEL_PATH`.
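
For example, a hypothetical edit could look like the snippet below. The variable names come from the demo scripts, but the exact locations and the values shown here are placeholders only.

```python
# Placeholder values; point these to your own image, frame folder, and checkpoint.
IMG_PATH = 'demo/images/sa_1002.jpg'
VID_PATH = 'demo/images/350_6L1vA-xJt-M'
MODEL_PATH = 'path/to/omg_seg_checkpoint.pth'
```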
29 |
30 | If you want to customize our model, please refer to the config scripts (`demo/configs/m2_convl.py` and `demo/configs/m2_convl_vid.py`) for details.
31 |
32 | Note that all the model-related code is imported in the config files. You can follow the corresponding import paths to find the model implementation details.
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002020.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002020.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002023.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002023.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002026.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002026.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002029.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002029.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002032.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002032.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002035.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002035.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002038.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002038.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002041.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002041.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002044.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002044.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002047.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002047.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002050.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002050.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002053.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002053.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002056.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002056.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002059.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002059.jpg
--------------------------------------------------------------------------------
/demo/images/350_6L1vA-xJt-M/00002062.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/350_6L1vA-xJt-M/00002062.jpg
--------------------------------------------------------------------------------
/demo/images/sa_1002.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/demo/images/sa_1002.jpg
--------------------------------------------------------------------------------
/ext/cityscapes_scripts/helpers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/ext/cityscapes_scripts/helpers/__init__.py
--------------------------------------------------------------------------------
/ext/cityscapes_scripts/helpers/version.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 |
5 | with open(os.path.join(os.path.dirname(__file__), '..', 'VERSION')) as f:
6 | version = f.read().strip()
7 |
8 | if __name__ == "__main__":
9 | print(version)
10 |
--------------------------------------------------------------------------------
/ext/davis2017/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | __version__ = '0.1.0'
4 |
--------------------------------------------------------------------------------
/ext/meta/sam_meta.py:
--------------------------------------------------------------------------------
1 | meta_dict = {
2 | 'vit_h': dict(
3 | encoder_embed_dim=1280,
4 | encoder_depth=32,
5 | encoder_num_heads=16,
6 | encoder_global_attn_indexes=[7, 15, 23, 31],
7 | # common
8 | prompt_embed_dim=256,
9 | image_size=1024,
10 | vit_patch_size=16,
11 | image_embedding_size=64
12 | ),
13 | 'vit_l': dict(
14 | encoder_embed_dim=1024,
15 | encoder_depth=24,
16 | encoder_num_heads=16,
17 | encoder_global_attn_indexes=[5, 11, 17, 23],
18 | # common
19 | prompt_embed_dim=256,
20 | image_size=1024,
21 | vit_patch_size=16,
22 | image_embedding_size=64
23 | ),
24 | 'vit_b': dict(
25 | encoder_embed_dim=768,
26 | encoder_depth=12,
27 | encoder_num_heads=12,
28 | encoder_global_attn_indexes=[2, 5, 8, 11],
29 | # common
30 | prompt_embed_dim=256,
31 | image_size=1024,
32 | vit_patch_size=16,
33 | image_embedding_size=64
34 | )
35 | }
36 |
37 | checkpoint_dict = {
38 | 'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth',
39 | 'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
40 | 'vit_b': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth',
41 | }
42 |
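
A minimal usage sketch for the dictionaries above (illustrative, not part of the repository). It assumes the repo root is on PYTHONPATH so that ext.meta.sam_meta is importable, and the checkpoint download needs network access:

import torch
from ext.meta.sam_meta import meta_dict, checkpoint_dict

backbone = 'vit_b'                        # smallest of the three SAM variants
cfg = meta_dict[backbone]
print(cfg['encoder_embed_dim'], cfg['encoder_global_attn_indexes'])  # 768 [2, 5, 8, 11]

# Fetch the matching official SAM checkpoint (cached under ~/.cache/torch).
state_dict = torch.hub.load_state_dict_from_url(
    checkpoint_dict[backbone], map_location='cpu')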
--------------------------------------------------------------------------------
/ext/open_clip/__init__.py:
--------------------------------------------------------------------------------
1 | from .coca_model import CoCa
2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint
5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss
6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype
8 | from .openai import load_openai_model, list_openai_models
9 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
10 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
11 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
12 | from .tokenizer import SimpleTokenizer, tokenize, decode
13 | from .transform import image_transform, AugmentationCfg
14 | from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy
15 | from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES
16 |
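
A hedged sketch of how the exports above are typically used for zero-shot scoring (illustrative only; the pretrained tag 'laion2b_s34b_b79k' comes from upstream open_clip and downloading it needs network access; the image path is one of the demo images in this repo):

import torch
from PIL import Image
from ext.open_clip import create_model_and_transforms, get_tokenizer

model, _, preprocess = create_model_and_transforms(
    'ViT-B-32', pretrained='laion2b_s34b_b79k')
tokenizer = get_tokenizer('ViT-B-32')

image = preprocess(Image.open('demo/images/sa_1002.jpg')).unsqueeze(0)
text = tokenizer(['a diagram', 'a dog', 'a street scene'])

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print(probs)  # one row of text-match probabilities per image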
--------------------------------------------------------------------------------
/ext/open_clip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/ext/open_clip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/ext/open_clip/constants.py:
--------------------------------------------------------------------------------
1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
3 |
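
These are the normalization statistics of OpenAI's released CLIP models. A small sketch of how they would typically be plugged into a torchvision preprocessing pipeline (illustrative, assuming torchvision is installed):

from torchvision import transforms
from ext.open_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD

preprocess = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),                                  # HWC uint8 -> CHW float in [0, 1]
    transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD),
])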
--------------------------------------------------------------------------------
/ext/open_clip/generation_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/ext/open_clip/generation_utils.py
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/EVA01-g-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva_giant_patch14_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/EVA01-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva_giant_patch14_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/EVA02-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_base_patch16_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/EVA02-E-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_enormous_patch14_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1280,
14 | "heads": 20,
15 | "layers": 32
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/EVA02-E-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_enormous_patch14_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/EVA02-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "timm_model_name": "eva02_large_patch14_clip_336",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/EVA02-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_large_patch14_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/RN101-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 23,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/RN101.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 23,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/RN50-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 6,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/RN50.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 6,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/RN50x16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 384,
5 | "layers": [
6 | 6,
7 | 8,
8 | 18,
9 | 8
10 | ],
11 | "width": 96,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 768,
18 | "heads": 12,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/RN50x4.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 288,
5 | "layers": [
6 | 4,
7 | 6,
8 | 10,
9 | 6
10 | ],
11 | "width": 80,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 640,
18 | "heads": 10,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/RN50x64.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 448,
5 | "layers": [
6 | 3,
7 | 15,
8 | 36,
9 | 10
10 | ],
11 | "width": 128,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 1024,
18 | "heads": 16,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-B-16-plus-240.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 240,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-B-16-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
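
Each JSON file under model_configs is resolved by its file name. A hedged sketch of how this particular config flows through the factory functions exported from ext.open_clip (illustrative; pretrained=None yields a randomly initialised model built from the JSON above):

from ext.open_clip import get_model_config, create_model

cfg = get_model_config('ViT-B-16')                          # parsed from ViT-B-16.json
print(cfg['embed_dim'], cfg['vision_cfg']['patch_size'])    # 512 16
model = create_model('ViT-B-16', pretrained=None)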
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-B-32-plus-256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 256,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-B-32-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-H-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 16
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-L-14-280.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 280,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-L-16-320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 320,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-L-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-M-16-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 16,
8 | "ls_init_value": 1e-4
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 384,
14 | "heads": 6,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-M-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-M-32-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 384,
13 | "heads": 6,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-M-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-S-16-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 256,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 256,
13 | "heads": 4,
14 | "layers": 10
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-S-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 384,
13 | "heads": 6,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-S-32-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 256,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 256,
13 | "heads": 4,
14 | "layers": 10
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-S-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 384,
13 | "heads": 6,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-bigG-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 48,
6 | "width": 1664,
7 | "head_width": 104,
8 | "mlp_ratio": 4.9231,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1280,
15 | "heads": 20,
16 | "layers": 32
17 | }
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-e-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 56,
6 | "width": 1792,
7 | "head_width": 112,
8 | "mlp_ratio": 8.5715,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1280,
15 | "heads": 20,
16 | "layers": 36
17 | }
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/ViT-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 40,
6 | "width": 1408,
7 | "head_width": 88,
8 | "mlp_ratio": 4.3637,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1024,
15 | "heads": 16,
16 | "layers": 24
17 | }
18 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/coca_ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32,
8 | "attentional_pool": true,
9 | "attn_pooler_heads": 8,
10 | "output_tokens": true
11 | },
12 | "text_cfg": {
13 | "context_length": 76,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12,
18 | "embed_cls": true,
19 | "output_tokens": true
20 | },
21 | "multimodal_cfg": {
22 | "context_length": 76,
23 | "vocab_size": 49408,
24 | "width": 512,
25 | "heads": 8,
26 | "layers": 12,
27 | "attn_pooler_heads": 8
28 | },
29 | "custom_text": true
30 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/coca_ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14,
8 | "attentional_pool": true,
9 | "attn_pooler_heads": 8,
10 | "output_tokens": true
11 | },
12 | "text_cfg": {
13 | "context_length": 76,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 12,
18 | "embed_cls": true,
19 | "output_tokens": true
20 | },
21 | "multimodal_cfg": {
22 | "context_length": 76,
23 | "vocab_size": 49408,
24 | "width": 768,
25 | "heads": 12,
26 | "layers": 12,
27 | "attn_pooler_heads": 12
28 | },
29 | "custom_text": true
30 | }
31 |
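
A hedged captioning sketch for this CoCa config (illustrative only; the pretrained tag 'mscoco_finetuned_laion2B-s13B-b90k' is an upstream open_clip tag and may differ from what this repo uses, and generation needs the weights downloaded):

import torch
from PIL import Image
from ext.open_clip import create_model_and_transforms, decode

model, _, transform = create_model_and_transforms(
    'coca_ViT-L-14', pretrained='mscoco_finetuned_laion2B-s13B-b90k')
im = transform(Image.open('demo/images/sa_1002.jpg')).unsqueeze(0)

with torch.no_grad():
    generated = model.generate(im)             # autoregressive caption token ids
caption = decode(generated[0])
print(caption.split('<end_of_text>')[0].replace('<start_of_text>', ''))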
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/coca_base.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "multimodal_cfg": {
4 | "width": 768,
5 | "context_length": 76,
6 | "vocab_size": 64000,
7 | "mlp_ratio": 4,
8 | "layers": 12,
9 | "dim_head": 64,
10 | "heads": 12,
11 | "n_queries": 256,
12 | "attn_pooler_heads": 8
13 | },
14 | "vision_cfg": {
15 | "image_size": 288,
16 | "layers": 12,
17 | "width": 768,
18 | "patch_size": 18,
19 | "output_tokens": true
20 | },
21 | "text_cfg": {
22 | "context_length": 76,
23 | "vocab_size": 64000,
24 | "layers": 12,
25 | "heads": 12,
26 | "width": 768,
27 | "embed_cls": true,
28 | "output_tokens": true
29 | },
30 | "custom_text": true
31 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/coca_roberta-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32,
8 | "output_tokens": true
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "roberta-base",
12 | "hf_tokenizer_name": "roberta-base",
13 | "proj": "linear",
14 | "width": 768,
15 | "output_tokens": true
16 | },
17 | "multimodal_cfg": {
18 | "context_length": 76,
19 | "width": 768,
20 | "heads": 8,
21 | "layers": 12
22 | },
23 | "custom_text": true
24 | }
25 |
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_base.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_base",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_base_w.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_base",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 640,
16 | "heads": 10,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_base_w_320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_base",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 320
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 640,
16 | "heads": 10,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_large.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_large",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_large_d.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_large",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "mlp",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 16
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_large_d_320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_large",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "mlp",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 320
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 16
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_small.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_small",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_tiny.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_tiny",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_xlarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_xlarge",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 1024,
16 | "heads": 16,
17 | "layers": 20
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_xxlarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_xxlarge",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 1024,
16 | "heads": 16,
17 | "layers": 24
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/convnext_xxlarge_320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_xxlarge",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 320
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 1024,
16 | "heads": 16,
17 | "layers": 24
18 | }
19 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/mt5-base-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "hf_model_name": "google/mt5-base",
11 | "hf_tokenizer_name": "google/mt5-base",
12 | "proj": "mlp",
13 | "pooler_type": "mean_pooler"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/mt5-xl-ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "google/mt5-xl",
12 | "hf_tokenizer_name": "google/mt5-xl",
13 | "proj": "mlp",
14 | "pooler_type": "mean_pooler"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/roberta-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "roberta-base",
12 | "hf_tokenizer_name": "roberta-base",
13 | "proj": "mlp",
14 | "pooler_type": "mean_pooler"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/swin_base_patch4_window7_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "timm_model_name": "swin_base_patch4_window7_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 640,
14 | "heads": 10,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/vit_medium_patch16_gap_256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_medium_patch16_gap_256",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 256
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "hf_model_name": "xlm-roberta-base",
11 | "hf_tokenizer_name": "xlm-roberta-base",
12 | "proj": "mlp",
13 | "pooler_type": "mean_pooler"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/ext/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "xlm-roberta-large",
12 | "hf_tokenizer_name": "xlm-roberta-large",
13 | "proj": "mlp",
14 | "pooler_type": "mean_pooler"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/ext/open_clip/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '2.20.0'
2 |
--------------------------------------------------------------------------------
/ext/sam/__init__.py:
--------------------------------------------------------------------------------
1 | from .image_encoder import ImageEncoderViT
2 | from .prompt_encoder import PromptEncoder
3 | from .mask_decoder import MaskDecoder
4 |
--------------------------------------------------------------------------------
/ext/sam/common.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import torch
8 | import torch.nn as nn
9 |
10 | from typing import Type
11 |
12 |
13 | class MLPBlock(nn.Module):
14 | def __init__(
15 | self,
16 | embedding_dim: int,
17 | mlp_dim: int,
18 | act: Type[nn.Module] = nn.GELU,
19 | ) -> None:
20 | super().__init__()
21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim)
22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim)
23 | self.act = act()
24 |
25 | def forward(self, x: torch.Tensor) -> torch.Tensor:
26 | return self.lin2(self.act(self.lin1(x)))
27 |
28 |
29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
31 | class LayerNorm2d(nn.Module):
32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
33 | super().__init__()
34 | self.weight = nn.Parameter(torch.ones(num_channels))
35 | self.bias = nn.Parameter(torch.zeros(num_channels))
36 | self.eps = eps
37 |
38 | def forward(self, x: torch.Tensor) -> torch.Tensor:
39 | u = x.mean(1, keepdim=True)
40 | s = (x - u).pow(2).mean(1, keepdim=True)
41 | x = (x - u) / torch.sqrt(s + self.eps)
42 | x = self.weight[:, None, None] * x + self.bias[:, None, None]
43 | return x
44 |
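
A small shape-check sketch for the two blocks above (illustrative, not part of the file): MLPBlock is a token-wise two-layer MLP, and LayerNorm2d normalises an NCHW feature map over its channel dimension.

import torch
from ext.sam.common import MLPBlock, LayerNorm2d

tokens = torch.randn(2, 196, 768)                  # (batch, tokens, embedding_dim)
mlp = MLPBlock(embedding_dim=768, mlp_dim=3072)
print(mlp(tokens).shape)                           # torch.Size([2, 196, 768])

fmap = torch.randn(2, 256, 64, 64)                 # (batch, channels, H, W)
norm = LayerNorm2d(256)
out = norm(fmap)
print(out.mean(dim=1).abs().max())                 # per-pixel channel mean is ~0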
--------------------------------------------------------------------------------
/ext/templates/__init__.py:
--------------------------------------------------------------------------------
1 | from .vild import VILD_PROMPT
2 |
--------------------------------------------------------------------------------
/ext/templates/vild.py:
--------------------------------------------------------------------------------
1 | # https://github.com/bytedance/fc-clip/blob/93f3122518e8a3ef98926e5ea761a776d5050430/fcclip/fcclip.py#L26C1-L41C2
2 | VILD_PROMPT = [
3 | "a photo of a {}.",
4 | "This is a photo of a {}",
5 | "There is a {} in the scene",
6 | "There is the {} in the scene",
7 | "a photo of a {} in the scene",
8 | "a photo of a small {}.",
9 | "a photo of a medium {}.",
10 | "a photo of a large {}.",
11 | "This is a photo of a small {}.",
12 | "This is a photo of a medium {}.",
13 | "This is a photo of a large {}.",
14 | "There is a small {} in the scene.",
15 | "There is a medium {} in the scene.",
16 | "There is a large {} in the scene.",
17 | ]
18 |
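
A short sketch of how these templates are typically expanded per class name before being fed to the CLIP text encoder (illustrative; averaging the resulting text embeddings per class is the usual ViLD/FC-CLIP recipe, not shown here):

from ext.templates import VILD_PROMPT

class_name = 'traffic light'
prompts = [template.format(class_name) for template in VILD_PROMPT]
print(len(prompts))   # 14 prompts for this class
print(prompts[0])     # a photo of a traffic light.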
--------------------------------------------------------------------------------
/figs/method_comparison.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/figs/method_comparison.jpg
--------------------------------------------------------------------------------
/figs/omg_teaser.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/figs/omg_teaser.jpg
--------------------------------------------------------------------------------
/omg_llava/.owners.yml:
--------------------------------------------------------------------------------
1 | assign:
2 | issues: disabled
3 | pull_requests: disabled
4 | strategy:
5 | random
6 | # daily-shift-based
7 | schedule:
8 | '*/1 * * * *'
9 |
--------------------------------------------------------------------------------
/omg_llava/.pre-commit-config-zh-cn.yaml:
--------------------------------------------------------------------------------
1 | exclude: ^tests/data/
2 | repos:
3 | - repo: https://gitee.com/openmmlab/mirrors-flake8
4 | rev: 5.0.4
5 | hooks:
6 | - id: flake8
7 | - repo: https://gitee.com/openmmlab/mirrors-isort
8 | rev: 5.11.5
9 | hooks:
10 | - id: isort
11 | - repo: https://gitee.com/openmmlab/mirrors-yapf
12 | rev: v0.32.0
13 | hooks:
14 | - id: yapf
15 | - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
16 | rev: v4.3.0
17 | hooks:
18 | - id: trailing-whitespace
19 | - id: check-yaml
20 | - id: end-of-file-fixer
21 | - id: requirements-txt-fixer
22 | - id: double-quote-string-fixer
23 | - id: check-merge-conflict
24 | - id: fix-encoding-pragma
25 | args: ["--remove"]
26 | - id: mixed-line-ending
27 | args: ["--fix=lf"]
28 | - repo: https://gitee.com/openmmlab/mirrors-codespell
29 | rev: v2.2.1
30 | hooks:
31 | - id: codespell
32 | - repo: https://gitee.com/openmmlab/mirrors-mdformat
33 | rev: 0.7.9
34 | hooks:
35 | - id: mdformat
36 | args: ["--number"]
37 | additional_dependencies:
38 | - mdformat-openmmlab
39 | - mdformat_frontmatter
40 | - linkify-it-py
41 | - repo: https://gitee.com/openmmlab/mirrors-docformatter
42 | rev: v1.3.1
43 | hooks:
44 | - id: docformatter
45 | args: ["--in-place", "--wrap-descriptions", "79"]
46 | - repo: https://github.com/asottile/pyupgrade
47 | rev: v3.0.0
48 | hooks:
49 | - id: pyupgrade
50 | args: ["--py36-plus"]
51 |
--------------------------------------------------------------------------------
/omg_llava/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: ^tests/data/
2 | repos:
3 | - repo: https://github.com/PyCQA/flake8
4 | rev: 5.0.4
5 | hooks:
6 | - id: flake8
7 | - repo: https://github.com/PyCQA/isort
8 | rev: 5.11.5
9 | hooks:
10 | - id: isort
11 | - repo: https://github.com/pre-commit/mirrors-yapf
12 | rev: v0.32.0
13 | hooks:
14 | - id: yapf
15 | exclude: 'xtuner/parallel/sequence/__init__.py'
16 | - repo: https://github.com/pre-commit/pre-commit-hooks
17 | rev: v4.3.0
18 | hooks:
19 | - id: trailing-whitespace
20 | - id: check-yaml
21 | - id: end-of-file-fixer
22 | - id: requirements-txt-fixer
23 | - id: double-quote-string-fixer
24 | - id: check-merge-conflict
25 | - id: fix-encoding-pragma
26 | args: ["--remove"]
27 | - id: mixed-line-ending
28 | args: ["--fix=lf"]
29 | - repo: https://github.com/codespell-project/codespell
30 | rev: v2.2.1
31 | hooks:
32 | - id: codespell
33 | - repo: https://github.com/executablebooks/mdformat
34 | rev: 0.7.9
35 | hooks:
36 | - id: mdformat
37 | args: ["--number"]
38 | additional_dependencies:
39 | - mdformat-openmmlab
40 | - mdformat_frontmatter
41 | - linkify-it-py
42 | exclude: 'docs/zh_cn/user_guides/sequence_parallel.md'
43 | - repo: https://github.com/myint/docformatter
44 | rev: v1.3.1
45 | hooks:
46 | - id: docformatter
47 | args: ["--in-place", "--wrap-descriptions", "79"]
48 | - repo: https://github.com/asottile/pyupgrade
49 | rev: v3.0.0
50 | hooks:
51 | - id: pyupgrade
52 | args: ["--py36-plus"]
53 |
--------------------------------------------------------------------------------
/omg_llava/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include xtuner/configs *.py *.yml *.json
2 | recursive-include xtuner/tools *.sh *.py
3 |
--------------------------------------------------------------------------------
/omg_llava/figs/omg_llava.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/figs/omg_llava.png
--------------------------------------------------------------------------------
/omg_llava/omg_llava/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/__init__.py
--------------------------------------------------------------------------------
/omg_llava/omg_llava/configs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/configs/__init__.py
--------------------------------------------------------------------------------
/omg_llava/omg_llava/dataset/collect_fns/__init__.py:
--------------------------------------------------------------------------------
1 | from .omg_llava_collate_fn import omg_llava_collate_fn
--------------------------------------------------------------------------------
/omg_llava/omg_llava/dataset/process_functions/__init__.py:
--------------------------------------------------------------------------------
1 | from .gcg_process import glamm_refcocog_map_fn, glamm_granf_map_fn, glamm_openpsg_map_fn, glamm_flickr_map_fn
2 | from .mdpv_points_process import mdpv_points_map_fn
3 | from .referring_seg_process import referring_seg_map_fn, referring_seg_gcg_format_map_fn
4 | from .region_caption_process import osprey_region_caption_map_fn, osprey_region_caption_gcg_format_map_fn, osprey_region_conversation_map_fn
5 | from .semantic_seg_process import semantic_seg_map_fn, pascal_part_map_fn, semantic_seg_gcg_format_map_fn, pascal_part_gcg_format_map_fn
6 | from .decoupled_gcg_process import glamm_openpsg_decoupled_given_objects_map_fn, glamm_openpsg_decoupled_given_description_map_fn,\
7 | glamm_flickr_decoupled_given_objects_map_fn, glamm_flickr_decoupled_given_description_map_fn,\
8 | glamm_granf_decoupled_given_objects_map_fn, glamm_granf_decoupled_given_description_map_fn,\
9 | glamm_refcocog_decoupled_given_description_map_fn, glamm_refcocog_decoupled_given_objects_map_fn
--------------------------------------------------------------------------------
/omg_llava/omg_llava/dataset/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import expand2square, expand2square_mask, expand2square_points, expand2square_bbox
--------------------------------------------------------------------------------
/omg_llava/omg_llava/dataset/utils/ade20k_classes.json:
--------------------------------------------------------------------------------
1 | [
2 | "wall", "building", "sky", "floor", "tree", "ceiling", "road",
3 | "bed", "windowpane", "grass", "cabinet", "sidewalk",
4 | "person", "earth", "door", "table", "mountain", "plant",
5 | "curtain", "chair", "car", "water", "painting", "sofa",
6 | "shelf", "house", "sea", "mirror", "rug", "field", "armchair",
7 | "seat", "fence", "desk", "rock", "wardrobe", "lamp",
8 | "bathtub", "railing", "cushion", "base", "box", "column",
9 | "signboard", "chest of drawers", "counter", "sand", "sink",
10 | "skyscraper", "fireplace", "refrigerator", "grandstand",
11 | "path", "stairs", "runway", "case", "pool table", "pillow",
12 | "screen door", "stairway", "river", "bridge", "bookcase",
13 | "blind", "coffee table", "toilet", "flower", "book", "hill",
14 | "bench", "countertop", "stove", "palm", "kitchen island",
15 | "computer", "swivel chair", "boat", "bar", "arcade machine",
16 | "hovel", "bus", "towel", "light", "truck", "tower",
17 | "chandelier", "awning", "streetlight", "booth",
18 | "television receiver", "airplane", "dirt track", "apparel",
19 | "pole", "land", "bannister", "escalator", "ottoman", "bottle",
20 | "buffet", "poster", "stage", "van", "ship", "fountain",
21 | "conveyer belt", "canopy", "washer", "plaything",
22 | "swimming pool", "stool", "barrel", "basket", "waterfall",
23 | "tent", "bag", "minibike", "cradle", "oven", "ball", "food",
24 | "step", "tank", "trade name", "microwave", "pot", "animal",
25 | "bicycle", "lake", "dishwasher", "screen", "blanket",
26 | "sculpture", "hood", "sconce", "vase", "traffic light",
27 | "tray", "ashcan", "fan", "pier", "crt screen", "plate",
28 | "monitor", "bulletin board", "shower", "radiator", "glass",
29 | "clock", "flag"
30 | ]
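
A tiny loading sketch (illustrative): the file is a flat list of ADE20K class names, read with the standard json module.

import json

with open('omg_llava/omg_llava/dataset/utils/ade20k_classes.json') as f:
    ade20k_classes = json.load(f)

print(len(ade20k_classes))                     # 150 ADE20K classes
print(ade20k_classes[0], ade20k_classes[-1])   # wall flag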
--------------------------------------------------------------------------------
/omg_llava/omg_llava/engine/__init__.py:
--------------------------------------------------------------------------------
1 | from .dataset_info_hook import DatasetInfoHook_withSpecoalTokens
2 | from .evaluate_chat_hook import EvaluateChatHook_withSpecialTokens
--------------------------------------------------------------------------------
/omg_llava/omg_llava/engine/dataset_info_hook.py:
--------------------------------------------------------------------------------
1 | from xtuner.registry import BUILDER
2 | from xtuner.engine.hooks import DatasetInfoHook
3 |
4 | class DatasetInfoHook_withSpecoalTokens(DatasetInfoHook):
5 | def __init__(self, tokenizer, is_intern_repo_dataset=False):
6 | self.tokenizer = BUILDER.build(tokenizer)
7 | self.is_intern_repo_dataset = is_intern_repo_dataset
8 | # add special tokens
9 | # Adding special tokens for pixel grounding
10 | segmentation_tokens = ['[SEG]']
11 | # Adding tokens for GCG
12 | phrase_tokens = ['<p>', '</p>']
13 | # add for visual prompt
14 | region_tokens = ['<region>']
15 | point_tokens = ['<mark>']
16 | special_tokens = segmentation_tokens + phrase_tokens + region_tokens + point_tokens
17 | self.tokenizer.add_tokens(special_tokens, special_tokens=True)
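
A hedged sketch of what add_tokens does to a Hugging Face tokenizer (illustrative; 'bert-base-uncased' is only a stand-in tokenizer, not the one OMG-LLaVA trains with, and loading it needs network access):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('bert-base-uncased')
n_added = tok.add_tokens(['[SEG]'], special_tokens=True)
print(n_added)                              # 1 new token registered
print(tok.convert_tokens_to_ids('[SEG]'))   # a fresh id at the end of the vocab
# The language model's embedding matrix must then be resized to match, e.g.
# model.resize_token_embeddings(len(tok)).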
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .convnext_clip import OpenCLIPBackbone, OpenCLIPBackbone_omgseg
2 | from .modules import ProjectorConfig_OMG_LLaVA, ProjectorModel_OMG_LLaVA
3 | from .omg_seg import OMGSegVisualEncoder, Mask2FormerVideoSemSamHead
4 | from .omg_llava import OMG_LLaVA
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/__init__.py:
--------------------------------------------------------------------------------
1 | from .openclip_backbone import OpenCLIPBackbone, OpenCLIPBackbone_omgseg
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/__init__.py:
--------------------------------------------------------------------------------
1 | from .coca_model import CoCa
2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint
5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss
6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype
8 | from .openai import load_openai_model, list_openai_models
9 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
10 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
11 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
12 | from .tokenizer import SimpleTokenizer, tokenize, decode
13 | from .transform import image_transform, AugmentationCfg
14 | from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy
15 | from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES
16 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/model/convnext_clip/open_clip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/constants.py:
--------------------------------------------------------------------------------
1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
3 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/generation_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/model/convnext_clip/open_clip/generation_utils.py
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA01-g-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva_giant_patch14_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA01-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva_giant_patch14_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_base_patch16_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-E-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_enormous_patch14_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1280,
14 | "heads": 20,
15 | "layers": 32
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-E-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_enormous_patch14_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "timm_model_name": "eva02_large_patch14_clip_336",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/EVA02-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "timm_model_name": "eva02_large_patch14_clip_224",
6 | "timm_model_pretrained": false,
7 | "timm_pool": "token",
8 | "timm_proj": null
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 12,
15 | "layers": 12
16 | },
17 | "custom_text": true
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN101-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 23,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN101.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 23,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 6,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 6,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50x16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 384,
5 | "layers": [
6 | 6,
7 | 8,
8 | 18,
9 | 8
10 | ],
11 | "width": 96,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 768,
18 | "heads": 12,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50x4.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 288,
5 | "layers": [
6 | 4,
7 | 6,
8 | 10,
9 | 6
10 | ],
11 | "width": 80,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 640,
18 | "heads": 10,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/RN50x64.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 448,
5 | "layers": [
6 | 3,
7 | 15,
8 | 36,
9 | 10
10 | ],
11 | "width": 128,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 1024,
18 | "heads": 16,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-16-plus-240.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 240,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-16-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-32-plus-256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 256,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-32-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-H-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 16
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-14-280.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 280,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-16-320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 320,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-L-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-M-16-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 16,
8 | "ls_init_value": 1e-4
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 384,
14 | "heads": 6,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-M-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-M-32-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 384,
13 | "heads": 6,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-M-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 512,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-S-16-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 256,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 256,
13 | "heads": 4,
14 | "layers": 10
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-S-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 384,
13 | "heads": 6,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-S-32-alt.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 256,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 256,
13 | "heads": 4,
14 | "layers": 10
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-S-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 384,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 384,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 384,
13 | "heads": 6,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-bigG-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 48,
6 | "width": 1664,
7 | "head_width": 104,
8 | "mlp_ratio": 4.9231,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1280,
15 | "heads": 20,
16 | "layers": 32
17 | }
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-e-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 56,
6 | "width": 1792,
7 | "head_width": 112,
8 | "mlp_ratio": 8.5715,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1280,
15 | "heads": 20,
16 | "layers": 36
17 | }
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/ViT-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 40,
6 | "width": 1408,
7 | "head_width": 88,
8 | "mlp_ratio": 4.3637,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1024,
15 | "heads": 16,
16 | "layers": 24
17 | }
18 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/coca_ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32,
8 | "attentional_pool": true,
9 | "attn_pooler_heads": 8,
10 | "output_tokens": true
11 | },
12 | "text_cfg": {
13 | "context_length": 76,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12,
18 | "embed_cls": true,
19 | "output_tokens": true
20 | },
21 | "multimodal_cfg": {
22 | "context_length": 76,
23 | "vocab_size": 49408,
24 | "width": 512,
25 | "heads": 8,
26 | "layers": 12,
27 | "attn_pooler_heads": 8
28 | },
29 | "custom_text": true
30 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/coca_ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14,
8 | "attentional_pool": true,
9 | "attn_pooler_heads": 8,
10 | "output_tokens": true
11 | },
12 | "text_cfg": {
13 | "context_length": 76,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 12,
18 | "embed_cls": true,
19 | "output_tokens": true
20 | },
21 | "multimodal_cfg": {
22 | "context_length": 76,
23 | "vocab_size": 49408,
24 | "width": 768,
25 | "heads": 12,
26 | "layers": 12,
27 | "attn_pooler_heads": 12
28 | },
29 | "custom_text": true
30 | }
31 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/coca_base.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "multimodal_cfg": {
4 | "width": 768,
5 | "context_length": 76,
6 | "vocab_size": 64000,
7 | "mlp_ratio": 4,
8 | "layers": 12,
9 | "dim_head": 64,
10 | "heads": 12,
11 | "n_queries": 256,
12 | "attn_pooler_heads": 8
13 | },
14 | "vision_cfg": {
15 | "image_size": 288,
16 | "layers": 12,
17 | "width": 768,
18 | "patch_size": 18,
19 | "output_tokens": true
20 | },
21 | "text_cfg": {
22 | "context_length": 76,
23 | "vocab_size": 64000,
24 | "layers": 12,
25 | "heads": 12,
26 | "width": 768,
27 | "embed_cls": true,
28 | "output_tokens": true
29 | },
30 | "custom_text": true
31 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/coca_roberta-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32,
8 | "output_tokens": true
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "roberta-base",
12 | "hf_tokenizer_name": "roberta-base",
13 | "proj": "linear",
14 | "width": 768,
15 | "output_tokens": true
16 | },
17 | "multimodal_cfg": {
18 | "context_length": 76,
19 | "width": 768,
20 | "heads": 8,
21 | "layers": 12
22 | },
23 | "custom_text": true
24 | }
25 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_base.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_base",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_base_w.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_base",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 640,
16 | "heads": 10,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_base_w_320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_base",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 320
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 640,
16 | "heads": 10,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_large.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_large",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_large_d.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_large",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "mlp",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 16
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_large_d_320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_large",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "mlp",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 320
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 768,
16 | "heads": 12,
17 | "layers": 16
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_small.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_small",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_tiny.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_tiny",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 224
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_xlarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_xlarge",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 1024,
16 | "heads": 16,
17 | "layers": 20
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_xxlarge.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_xxlarge",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 256
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 1024,
16 | "heads": 16,
17 | "layers": 24
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/convnext_xxlarge_320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "convnext_xxlarge",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "timm_drop": 0.0,
9 | "timm_drop_path": 0.1,
10 | "image_size": 320
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 1024,
16 | "heads": 16,
17 | "layers": 24
18 | }
19 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/mt5-base-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "hf_model_name": "google/mt5-base",
11 | "hf_tokenizer_name": "google/mt5-base",
12 | "proj": "mlp",
13 | "pooler_type": "mean_pooler"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/mt5-xl-ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "google/mt5-xl",
12 | "hf_tokenizer_name": "google/mt5-xl",
13 | "proj": "mlp",
14 | "pooler_type": "mean_pooler"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/roberta-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "roberta-base",
12 | "hf_tokenizer_name": "roberta-base",
13 | "proj": "mlp",
14 | "pooler_type": "mean_pooler"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/swin_base_patch4_window7_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "timm_model_name": "swin_base_patch4_window7_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 640,
14 | "heads": 10,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/vit_medium_patch16_gap_256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_medium_patch16_gap_256",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 256
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "hf_model_name": "xlm-roberta-base",
11 | "hf_tokenizer_name": "xlm-roberta-base",
12 | "proj": "mlp",
13 | "pooler_type": "mean_pooler"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "hf_model_name": "xlm-roberta-large",
12 | "hf_tokenizer_name": "xlm-roberta-large",
13 | "proj": "mlp",
14 | "pooler_type": "mean_pooler"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/convnext_clip/open_clip/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '2.20.0'
2 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from xtuner.model import *
2 | from .projector import ProjectorModel_OMG_LLaVA, ProjectorConfig_OMG_LLaVA
3 |
4 | __all__ = ['ProjectorConfig_OMG_LLaVA', 'ProjectorModel_OMG_LLaVA', ]
5 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/modules/projector/__init__.py:
--------------------------------------------------------------------------------
1 | from xtuner.model.modules.projector import *
2 | from transformers import AutoConfig, AutoModel
3 | from .configuration_projector import ProjectorConfig_OMG_LLaVA
4 | from .modeling_projector import ProjectorModel_OMG_LLaVA
5 |
6 | AutoConfig.register('projector', ProjectorConfig_OMG_LLaVA)
7 | AutoModel.register(ProjectorConfig_OMG_LLaVA, ProjectorModel_OMG_LLaVA)
8 |
9 | __all__ = ['ProjectorConfig_OMG_LLaVA', 'ProjectorModel_OMG_LLaVA']
10 |
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/modules/projector/configuration_projector.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from transformers import PretrainedConfig
3 |
4 | class ProjectorConfig_OMG_LLaVA(PretrainedConfig):
5 | model_type = 'projector'
6 | _auto_class = 'AutoConfig'
7 |
8 | def __init__(
9 | self,
10 | visual_hidden_size=4096,
11 | llm_hidden_size=4096,
12 | depth=2,
13 | hidden_act='gelu',
14 | bias=True,
15 | query_channels=256,
16 | feat_channels=1536,
17 | pixel_shuffle_ratio=None,
18 | additional_bg_tokens=10,
19 | visual_prompt_proj=False,
20 | add_cross_attn_layer=False,
21 | **kwargs,
22 | ):
23 | self.visual_hidden_size = visual_hidden_size
24 | self.llm_hidden_size = llm_hidden_size
25 | self.depth = depth
26 | self.hidden_act = hidden_act
27 | self.bias = bias
28 | self.query_channels=query_channels
29 | self.feat_channels=feat_channels
30 | if pixel_shuffle_ratio is not None:
31 | self.feat_channels = self.feat_channels * pixel_shuffle_ratio * pixel_shuffle_ratio
32 | self.additional_bg_tokens = additional_bg_tokens
33 | self.visual_prompt_proj = visual_prompt_proj
34 | self.add_cross_attn_layer = add_cross_attn_layer
35 | super().__init__(**kwargs)
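Note that `pixel_shuffle_ratio` above silently rescales `feat_channels`; a minimal sketch of the effect (the import path is assumed from the directory layout):

```python
# Illustrative only; import path assumed from the repo layout.
from omg_llava.model.modules.projector import ProjectorConfig_OMG_LLaVA

cfg = ProjectorConfig_OMG_LLaVA(
    visual_hidden_size=4096,
    llm_hidden_size=4096,
    feat_channels=1536,
    pixel_shuffle_ratio=2,   # feat_channels becomes 1536 * 2 * 2 = 6144
)
assert cfg.feat_channels == 6144
```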
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/omg_seg/__init__.py:
--------------------------------------------------------------------------------
1 | from .omg_seg_visual_encoder import OMGSegVisualEncoder
2 | from .mask2former_vid_semanticsam import Mask2FormerVideoSemSamHead
--------------------------------------------------------------------------------
/omg_llava/omg_llava/model/omg_seg/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 | def mask_pool(x, mask):
5 | """
6 | Args:
7 | x: [B, C, H, W]
8 | mask: [B, Q, H, W]
9 | """
10 | if not x.shape[-2:] == mask.shape[-2:]:
11 | # reshape mask to x
12 | mask = F.interpolate(mask, size=x.shape[-2:], mode='bilinear', align_corners=False)
13 | with torch.no_grad():
14 | mask = mask.detach()
15 | mask = (mask > 0).to(mask.dtype)
16 | denorm = mask.sum(dim=(-1, -2), keepdim=True) + 1e-8
17 |
18 | mask_pooled_x = torch.einsum(
19 | "bchw,bqhw->bqc",
20 | x,
21 | mask / denorm,
22 | )
23 |
24 | return mask_pooled_x
25 |
26 |
27 |
28 |
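As the docstring indicates, `mask_pool` averages the feature map under each binarized query mask, resizing the masks to the feature resolution first; a purely illustrative shape check:

```python
# Illustrative shape check with random tensors; the import path is assumed
# from the repo layout.
import torch
from omg_llava.model.omg_seg.utils import mask_pool

feats = torch.randn(2, 256, 32, 32)    # [B, C, H, W]
masks = torch.randn(2, 8, 64, 64)      # [B, Q, H', W'], resized internally

pooled = mask_pool(feats, masks)
print(pooled.shape)                    # torch.Size([2, 8, 256]): one C-dim vector per query
```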
--------------------------------------------------------------------------------
/omg_llava/omg_llava/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/omg_llava/tools/__init__.py
--------------------------------------------------------------------------------
/omg_llava/requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements/runtime.txt
2 | -r requirements/deepspeed.txt
3 | -r requirements/modelscope.txt
4 |
5 | gradio==4.37.2
6 | gradio-image-prompter
7 | pycocotools
8 | timm
9 | ftfy
10 | kornia
--------------------------------------------------------------------------------
/omg_llava/requirements/deepspeed.txt:
--------------------------------------------------------------------------------
1 | # Minimum 0.12.3, see https://github.com/microsoft/DeepSpeed/pull/4587
2 | deepspeed>=0.12.3
3 | mpi4py-mpich
4 |
--------------------------------------------------------------------------------
/omg_llava/requirements/docs.txt:
--------------------------------------------------------------------------------
1 | docutils
2 | myst-parser==2.0.0
3 | sphinx==6.2.1
4 | sphinx-argparse
5 | sphinx-book-theme==1.0.1
6 | sphinx-copybutton==0.5.2
7 | sphinx_markdown_tables
8 |
--------------------------------------------------------------------------------
/omg_llava/requirements/modelscope.txt:
--------------------------------------------------------------------------------
1 | modelscope
2 |
--------------------------------------------------------------------------------
/omg_llava/requirements/runtime.txt:
--------------------------------------------------------------------------------
1 | # Minimum 0.40.0.post4 to fix some 4-bit precision bugs
2 | bitsandbytes>=0.40.0.post4
3 | # Minimum 2.16.0 to fix some bugs, see https://github.com/huggingface/datasets/pull/6444
4 | datasets>=2.16.0
5 | einops
6 | # Minimum 0.1.2 to fix some bugs, see https://github.com/InternLM/lagent/pull/44
7 | lagent>=0.1.2
8 | # Minimum 0.10.3 to support distributed evaluation for MMBench
9 | # see https://github.com/open-mmlab/mmengine/pull/1469
10 | mmengine>=0.10.3
11 | openpyxl
12 | triton==2.1.0
13 | # Minimum 0.4.0 to support QLoRA, see https://github.com/huggingface/peft/pull/476
14 | peft>=0.4.0
15 | scikit-image
16 | scipy
17 | SentencePiece
18 | tiktoken
19 | torch
20 | torchvision
21 | # Minimum 4.36.0 to support `Cache` data structure used by KV Cache
22 | # Registering a causal mask in `LlamaModel` is not friendly for very large
23 | # `max_position_embeddings`. Refer to
24 | # https://github.com/huggingface/transformers/blob/v4.38.0/src/transformers/models/llama/modeling_llama.py#L921-L923
25 | # transformers>=4.36.0,!=4.38.0,!=4.38.1,!=4.38.2
26 | transformers==4.36.0
27 | transformers_stream_generator
28 |
--------------------------------------------------------------------------------
/omg_llava/setup.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | line_length = 79
3 | multi_line_output = 0
4 | extra_standard_library = setuptools
5 | known_first_party = xtuner
6 | known_third_party = pytest,yaml
7 | no_lines_before = STDLIB,LOCALFOLDER
8 | default_section = THIRDPARTY
9 |
10 | [yapf]
11 | BASED_ON_STYLE = pep8
12 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
13 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
14 |
15 | [codespell]
16 | ignore-words-list = nd, ba, warmup
17 |
--------------------------------------------------------------------------------
/omg_llava/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/test.jpg
--------------------------------------------------------------------------------
/omg_llava/xtuner/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import os
3 |
4 | from mmengine.utils import digit_version
5 |
6 | from .entry_point import cli
7 | from .version import __version__, version_info
8 |
9 | HF_CEPH_HUB = os.getenv('HF_CEPH_HUB', '')
10 | HF_USE_CEPH = os.getenv('HF_USE_CEPH', 0) or HF_CEPH_HUB != ''
11 | DS_CEPH_DIR = os.getenv('DS_CEPH_DIR', None)
12 | if HF_USE_CEPH:
13 | from .utils.fileio import (patch_hf_auto_from_pretrained,
14 | patch_hf_save_pretrained)
15 | patch_hf_auto_from_pretrained(HF_CEPH_HUB)
16 | patch_hf_save_pretrained()
17 |
18 | if DS_CEPH_DIR:
19 | from .utils.fileio import patch_deepspeed_engine
20 | patch_deepspeed_engine()
21 |
22 | __all__ = [
23 | '__version__', 'version_info', 'digit_version', 'cli', 'HF_USE_CEPH',
24 | 'DS_CEPH_DIR'
25 | ]
26 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .datasets import * # noqa: F401, F403
3 | from .model import * # noqa: F401, F403
4 | from .training_args import * # noqa: F401, F403
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/datasets/arxiv.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from datasets import load_dataset
5 |
6 | from xtuner.dataset import process_hf_dataset
7 | from xtuner.dataset.collate_fns import default_collate_fn
8 | from xtuner.dataset.map_fns import arxiv_map_fn, template_map_fn_factory
9 | from xtuner.utils import PROMPT_TEMPLATE
10 |
11 |
12 | def arxiv_dataset(tokenizer,
13 | data_file=None,
14 | max_length=2048,
15 | prompt_template=PROMPT_TEMPLATE.default,
16 | remove_unused_columns=True,
17 | pack_to_max_length=True):
18 | template_map_fn = template_map_fn_factory(template=prompt_template)
19 | # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv # noqa: E501
20 | # 2. Process data with `./tools/data_preprocess/arxiv.py`
21 | if data_file is None:
22 | data_file = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json'
23 | dataset_org = load_dataset(path='json', data_files=dict(train=data_file))
24 | dataset = process_hf_dataset(
25 | dataset=dataset_org,
26 | tokenizer=tokenizer,
27 | max_length=max_length,
28 | dataset_map_fn=arxiv_map_fn,
29 | template_map_fn=template_map_fn,
30 | remove_unused_columns=remove_unused_columns,
31 | shuffle_before_pack=True,
32 | pack_to_max_length=pack_to_max_length)
33 |
34 | return dataset
35 |
36 |
37 | def arxiv_data_collator(return_hf_format=False):
38 | return partial(default_collate_fn, return_hf_format=return_hf_format)
39 |
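This wrapper and the sibling dataset files that follow share the same shape: map the raw dataset, apply the prompt template, and optionally pack to `max_length`. A minimal sketch of wiring it up, assuming the preprocessed JSON exists at the default path and using an illustrative tokenizer checkpoint:

```python
# Illustrative only; the tokenizer checkpoint is an example, and the arxiv
# JSON is assumed to exist at the default path mentioned in the comments above.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    'internlm/internlm2-chat-7b', trust_remote_code=True)

dataset = arxiv_dataset(tokenizer, max_length=2048)
loader = DataLoader(dataset, batch_size=1, collate_fn=arxiv_data_collator())
```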
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/datasets/code_alpaca.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from datasets import load_dataset
5 |
6 | from xtuner.dataset import process_hf_dataset
7 | from xtuner.dataset.collate_fns import default_collate_fn
8 | from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory
9 | from xtuner.utils import PROMPT_TEMPLATE
10 |
11 |
12 | def code_alpaca_dataset(tokenizer,
13 | path='HuggingFaceH4/CodeAlpaca_20K',
14 | max_length=2048,
15 | prompt_template=PROMPT_TEMPLATE.default,
16 | remove_unused_columns=True,
17 | pack_to_max_length=True):
18 | template_map_fn = template_map_fn_factory(template=prompt_template)
19 | dataset_org = load_dataset(path)
20 | dataset = process_hf_dataset(
21 | dataset=dataset_org,
22 | tokenizer=tokenizer,
23 | max_length=max_length,
24 | dataset_map_fn=code_alpaca_map_fn,
25 | template_map_fn=template_map_fn,
26 | remove_unused_columns=remove_unused_columns,
27 | shuffle_before_pack=True,
28 | pack_to_max_length=pack_to_max_length)
29 |
30 | return dataset
31 |
32 |
33 | def code_alpaca_data_collator(return_hf_format=False):
34 | return partial(default_collate_fn, return_hf_format=return_hf_format)
35 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/datasets/colorist.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from datasets import load_dataset
5 |
6 | from xtuner.dataset import process_hf_dataset
7 | from xtuner.dataset.collate_fns import default_collate_fn
8 | from xtuner.dataset.map_fns import colors_map_fn, template_map_fn_factory
9 | from xtuner.utils import PROMPT_TEMPLATE
10 |
11 |
12 | def colorist_dataset(tokenizer,
13 | path='burkelibbey/colors',
14 | max_length=2048,
15 | prompt_template=PROMPT_TEMPLATE.default,
16 | remove_unused_columns=True,
17 | pack_to_max_length=True):
18 | template_map_fn = template_map_fn_factory(template=prompt_template)
19 | dataset_org = load_dataset(path)
20 | dataset = process_hf_dataset(
21 | dataset=dataset_org,
22 | tokenizer=tokenizer,
23 | max_length=max_length,
24 | dataset_map_fn=colors_map_fn,
25 | template_map_fn=template_map_fn,
26 | remove_unused_columns=remove_unused_columns,
27 | shuffle_before_pack=True,
28 | pack_to_max_length=pack_to_max_length)
29 |
30 | return dataset
31 |
32 |
33 | def colorist_data_collator(return_hf_format=False):
34 | return partial(default_collate_fn, return_hf_format=return_hf_format)
35 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/datasets/medical.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from datasets import load_dataset
5 |
6 | from xtuner.dataset import process_hf_dataset
7 | from xtuner.dataset.collate_fns import default_collate_fn
8 | from xtuner.dataset.map_fns import medical_map_fn, template_map_fn_factory
9 | from xtuner.utils import PROMPT_TEMPLATE
10 |
11 |
12 | def medical_dataset(tokenizer,
13 | path='shibing624/medical',
14 | max_length=2048,
15 | prompt_template=PROMPT_TEMPLATE.default,
16 | remove_unused_columns=False,
17 | pack_to_max_length=True):
18 | template_map_fn = template_map_fn_factory(template=prompt_template)
19 | dataset_org = load_dataset(path)
20 | dataset = process_hf_dataset(
21 | dataset=dataset_org,
22 | tokenizer=tokenizer,
23 | max_length=max_length,
24 | dataset_map_fn=medical_map_fn,
25 | template_map_fn=template_map_fn,
26 | remove_unused_columns=remove_unused_columns,
27 | shuffle_before_pack=True,
28 | pack_to_max_length=pack_to_max_length)
29 |
30 | return dataset
31 |
32 |
33 | def medical_data_collator(return_hf_format=False):
34 | return partial(default_collate_fn, return_hf_format=return_hf_format)
35 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/datasets/oasst1.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from datasets import load_dataset
5 |
6 | from xtuner.dataset import process_hf_dataset
7 | from xtuner.dataset.collate_fns import default_collate_fn
8 | from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory
9 | from xtuner.utils import PROMPT_TEMPLATE
10 |
11 |
12 | def oasst1_dataset(tokenizer,
13 | path='timdettmers/openassistant-guanaco',
14 | max_length=2048,
15 | prompt_template=PROMPT_TEMPLATE.default,
16 | remove_unused_columns=False,
17 | pack_to_max_length=True):
18 | template_map_fn = template_map_fn_factory(template=prompt_template)
19 | dataset_org = load_dataset(path)
20 | dataset = process_hf_dataset(
21 | dataset=dataset_org,
22 | tokenizer=tokenizer,
23 | max_length=max_length,
24 | dataset_map_fn=oasst1_map_fn,
25 | template_map_fn=template_map_fn,
26 | remove_unused_columns=remove_unused_columns,
27 | shuffle_before_pack=True,
28 | pack_to_max_length=pack_to_max_length)
29 |
30 | return dataset
31 |
32 |
33 | def oasst1_data_collator(return_hf_format=False):
34 | return partial(default_collate_fn, return_hf_format=return_hf_format)
35 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/datasets/open_orca.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from datasets import load_dataset
5 |
6 | from xtuner.dataset import process_hf_dataset
7 | from xtuner.dataset.collate_fns import default_collate_fn
8 | from xtuner.dataset.map_fns import openorca_map_fn, template_map_fn_factory
9 | from xtuner.utils import PROMPT_TEMPLATE
10 |
11 |
12 | def openorca_dataset(tokenizer,
13 | path='Open-Orca/OpenOrca',
14 | max_length=2048,
15 | prompt_template=PROMPT_TEMPLATE.default,
16 | remove_unused_columns=True,
17 | pack_to_max_length=True):
18 | template_map_fn = template_map_fn_factory(template=prompt_template)
19 | dataset_org = load_dataset(path)
20 | dataset = process_hf_dataset(
21 | dataset=dataset_org,
22 | tokenizer=tokenizer,
23 | max_length=max_length,
24 | dataset_map_fn=openorca_map_fn,
25 | template_map_fn=template_map_fn,
26 | remove_unused_columns=remove_unused_columns,
27 | shuffle_before_pack=True,
28 | pack_to_max_length=pack_to_max_length)
29 |
30 | return dataset
31 |
32 |
33 | def openorca_data_collator(return_hf_format=False):
34 | return partial(default_collate_fn, return_hf_format=return_hf_format)
35 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/datasets/sql.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from datasets import load_dataset
5 |
6 | from xtuner.dataset import process_hf_dataset
7 | from xtuner.dataset.collate_fns import default_collate_fn
8 | from xtuner.dataset.map_fns import sql_map_fn, template_map_fn_factory
9 | from xtuner.utils import PROMPT_TEMPLATE
10 |
11 |
12 | def sql_dataset(tokenizer,
13 | path='b-mc2/sql-create-context',
14 | max_length=2048,
15 | prompt_template=PROMPT_TEMPLATE.default,
16 | remove_unused_columns=True,
17 | pack_to_max_length=True):
18 | template_map_fn = template_map_fn_factory(template=prompt_template)
19 | dataset_org = load_dataset(path)
20 | dataset = process_hf_dataset(
21 | dataset=dataset_org,
22 | tokenizer=tokenizer,
23 | max_length=max_length,
24 | dataset_map_fn=sql_map_fn,
25 | template_map_fn=template_map_fn,
26 | remove_unused_columns=remove_unused_columns,
27 | shuffle_before_pack=True,
28 | pack_to_max_length=pack_to_max_length)
29 |
30 | return dataset
31 |
32 |
33 | def sql_data_collator(return_hf_format=False):
34 | return partial(default_collate_fn, return_hf_format=return_hf_format)
35 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/datasets/tiny_codes.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from datasets import load_dataset
5 |
6 | from xtuner.dataset import process_hf_dataset
7 | from xtuner.dataset.collate_fns import default_collate_fn
8 | from xtuner.dataset.map_fns import template_map_fn_factory, tiny_codes_map_fn
9 | from xtuner.utils import PROMPT_TEMPLATE
10 |
11 |
12 | def tiny_codes_dataset(tokenizer,
13 | path='nampdn-ai/tiny-codes',
14 | max_length=2048,
15 | prompt_template=PROMPT_TEMPLATE.default,
16 | remove_unused_columns=True,
17 | pack_to_max_length=True):
18 | template_map_fn = template_map_fn_factory(template=prompt_template)
19 | dataset_org = load_dataset(path)
20 | dataset = process_hf_dataset(
21 | dataset=dataset_org,
22 | tokenizer=tokenizer,
23 | max_length=max_length,
24 | dataset_map_fn=tiny_codes_map_fn,
25 | template_map_fn=template_map_fn,
26 | remove_unused_columns=remove_unused_columns,
27 | shuffle_before_pack=True,
28 | pack_to_max_length=pack_to_max_length)
29 |
30 | return dataset
31 |
32 |
33 | def tiny_codes_data_collator(return_hf_format=False):
34 | return partial(default_collate_fn, return_hf_format=return_hf_format)
35 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/apis/datasets/wizardlm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from datasets import load_dataset
5 |
6 | from xtuner.dataset import process_hf_dataset
7 | from xtuner.dataset.collate_fns import default_collate_fn
8 | from xtuner.dataset.map_fns import template_map_fn_factory, wizardlm_map_fn
9 | from xtuner.utils import PROMPT_TEMPLATE
10 |
11 |
12 | def wizardlm_dataset(tokenizer,
13 | path='WizardLM/WizardLM_evol_instruct_V2_196k',
14 | max_length=2048,
15 | prompt_template=PROMPT_TEMPLATE.default,
16 | remove_unused_columns=False,
17 | pack_to_max_length=True):
18 | template_map_fn = template_map_fn_factory(template=prompt_template)
19 | dataset_org = load_dataset(path)
20 | dataset = process_hf_dataset(
21 | dataset=dataset_org,
22 | tokenizer=tokenizer,
23 | max_length=max_length,
24 | dataset_map_fn=wizardlm_map_fn,
25 | template_map_fn=template_map_fn,
26 | remove_unused_columns=remove_unused_columns,
27 | shuffle_before_pack=True,
28 | pack_to_max_length=pack_to_max_length)
29 |
30 | return dataset
31 |
32 |
33 | def wizardlm_data_collator(return_hf_format=False):
34 | return partial(default_collate_fn, return_hf_format=return_hf_format)
35 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/configs/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import os
3 |
4 |
5 | def get_cfgs_name_path():
6 | path = os.path.dirname(__file__)
7 | mapping = {}
8 | for root, dirs, files in os.walk(path):
9 | for file_ in files:
10 | if file_.endswith(
11 | ('.py', '.json')
12 | ) and not file_.startswith('.') and not file_.startswith('_'):
13 | mapping[os.path.splitext(file_)[0]] = os.path.join(root, file_)
14 | return mapping
15 |
16 |
17 | cfgs_name_path = get_cfgs_name_path()
18 |
19 | __all__ = ['cfgs_name_path']
20 |
--------------------------------------------------------------------------------
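`cfgs_name_path` is the lookup table the CLI tools rely on: `copy_cfg.py` and `list_cfg.py` later in this repo resolve a bare config name to its file path through it. A short sketch:

```python
# Resolve a built-in config name to its absolute path, as copy_cfg.py does.
from xtuner.configs import cfgs_name_path

name = 'cohere_100b_128k_sp32'   # any key printed by list_cfg.py
if name in cfgs_name_path:
    print(cfgs_name_path[name])  # absolute path to the matching .py/.json config
```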
/omg_llava/xtuner/configs/cohere/README.md:
--------------------------------------------------------------------------------
1 | # Cohere 100B
2 |
3 | ## Install
4 |
5 | ```bash
6 | # Install the latest xtuner
7 | pip install -U 'xtuner[deepspeed]'
8 |
9 | # Cohere requires the latest version of transformers.
10 | pip install git+https://github.com/huggingface/transformers.git
11 |
12 | # Sequence parallel requires flash-attn
13 | pip install flash-attn
14 | ```
15 |
16 | ## Full Parameter Fine-tune
17 |
18 | Full-parameter fine-tuning requires 64 A100 80G GPUs.
19 |
20 | ### slurm
21 |
22 | Note: `$PARTITION` is the Slurm partition to submit the job to.
23 |
24 | ```bash
25 | srun -p $PARTITION --job-name=Cohere --nodes=8 --gres=gpu:8 --ntasks-per-node=8 xtuner train cohere_100b_128k_sp32 --deepspeed deepspeed_zero3 --launcher slurm
26 | ```
27 |
28 | ### torchrun
29 |
30 | Note: `$NODE_0_ADDR` is the IP address of node 0.
31 |
32 | ```bash
33 | # execute on node 0
34 | NPROC_PER_NODE=8 NNODES=8 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=0 xtuner train cohere_100b_128k_sp32 --deepspeed deepspeed_zero3
35 |
37 | # execute on node 1
37 | NPROC_PER_NODE=8 NNODES=8 PORT=29600 ADDR=$NODE_0_ADDR NODE_RANK=1 xtuner train cohere_100b_128k_sp32 --deepspeed deepspeed_zero3
38 | ```
39 |
40 | ### Speed
41 |
42 | Measured on A100 80G GPUs:
43 |
44 | | Model | Sequence Length | GPUs Number | Sequence Parallel World Size | Tokens per Second | TFLOPs |
45 | | :---------: | :-------------: | :---------: | :--------------------------: | :---------------: | :----: |
46 | | Cohere_100b | 128k | 64 | 32 | 97.3 | 173.4 |
47 | | Cohere_100b | 128k | 128 | 16 | 102.1 | 182.7 |
48 | | Cohere_100b | 128k | 256 | 16 | 101.3 | 181.3 |
49 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/configs/deepspeed/deepspeed_zero1.json:
--------------------------------------------------------------------------------
1 | {
2 | "gradient_accumulation_steps": "auto",
3 | "train_micro_batch_size_per_gpu": "auto",
4 | "gradient_clipping": "auto",
5 | "zero_allow_untested_optimizer": true,
6 | "zero_force_ds_cpu_optimizer": false,
7 | "zero_optimization": {
8 | "stage": 1,
9 | "overlap_comm": true
10 | },
11 | "fp16": {
12 | "enabled": "auto",
13 | "initial_scale_power": 16
14 | },
15 | "bf16": {
16 | "enabled": "auto"
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/configs/deepspeed/deepspeed_zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "gradient_accumulation_steps": "auto",
3 | "train_micro_batch_size_per_gpu": "auto",
4 | "gradient_clipping": "auto",
5 | "zero_allow_untested_optimizer": true,
6 | "zero_force_ds_cpu_optimizer": false,
7 | "zero_optimization": {
8 | "stage": 2,
9 | "overlap_comm": true
10 | },
11 | "fp16": {
12 | "enabled": "auto",
13 | "initial_scale_power": 16
14 | },
15 | "bf16": {
16 | "enabled": "auto"
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/configs/deepspeed/deepspeed_zero2_offload.json:
--------------------------------------------------------------------------------
1 | {
2 | "gradient_accumulation_steps": "auto",
3 | "train_micro_batch_size_per_gpu": "auto",
4 | "gradient_clipping": "auto",
5 | "zero_allow_untested_optimizer": true,
6 | "zero_force_ds_cpu_optimizer": false,
7 | "zero_optimization": {
8 | "stage": 2,
9 | "overlap_comm": true,
10 | "offload_optimizer": {
11 | "device": "cpu",
12 | "pin_memory": true
13 | }
14 | },
15 | "fp16": {
16 | "enabled": "auto",
17 | "initial_scale_power": 16
18 | },
19 | "bf16": {
20 | "enabled": "auto"
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/configs/deepspeed/deepspeed_zero3.json:
--------------------------------------------------------------------------------
1 | {
2 | "gradient_accumulation_steps": "auto",
3 | "train_micro_batch_size_per_gpu": "auto",
4 | "gradient_clipping": "auto",
5 | "zero_allow_untested_optimizer": true,
6 | "zero_force_ds_cpu_optimizer": false,
7 | "zero_optimization": {
8 | "stage": 3,
9 | "overlap_comm": true,
10 | "stage3_gather_16bit_weights_on_model_save": true
11 | },
12 | "fp16": {
13 | "enabled": "auto",
14 | "initial_scale_power": 16
15 | },
16 | "bf16": {
17 | "enabled": "auto"
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/configs/deepspeed/deepspeed_zero3_offload.json:
--------------------------------------------------------------------------------
1 | {
2 | "gradient_accumulation_steps": "auto",
3 | "train_micro_batch_size_per_gpu": "auto",
4 | "gradient_clipping": "auto",
5 | "zero_allow_untested_optimizer": true,
6 | "zero_force_ds_cpu_optimizer": false,
7 | "zero_optimization": {
8 | "stage": 3,
9 | "overlap_comm": true,
10 | "offload_optimizer": {
11 | "device": "cpu",
12 | "pin_memory": true
13 | },
14 | "offload_param": {
15 | "device": "cpu",
16 | "pin_memory": true
17 | },
18 | "stage3_gather_16bit_weights_on_model_save": true
19 | },
20 | "fp16": {
21 | "enabled": "auto",
22 | "initial_scale_power": 16
23 | },
24 | "bf16": {
25 | "enabled": "auto"
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/configs/qwen/qwen1_5/qwen1_5_110b_chat/README.md:
--------------------------------------------------------------------------------
1 | # Qwen 110B
2 |
3 | ## Install
4 |
5 | ```bash
6 | # Install the latest xtuner
7 | pip install -U 'xtuner[deepspeed]'
8 |
9 | # We recommend installing flash_attn
10 | # pip install flash-attn
11 |
12 | # install the latest transformers
13 | pip install -U transformers
14 | ```
15 |
16 | ## QLoRA Fine-tune
17 |
18 | Fine-tuning Qwen 110B with 32k context requires only 2 * A100 80G GPUs.
19 |
20 | ```bash
21 | xtuner train xtuner/configs/qwen/qwen1_5/qwen1_5_110b_chat/qwen1_5_110b_chat_qlora_alpaca_e3_16k_2gpus.py --deepspeed deepspeed_zero3
22 | ```
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import warnings
3 |
4 | from .concat_dataset import ConcatDataset
5 | from .huggingface import process_hf_dataset
6 | from .intern_repo import (build_packed_dataset,
7 | load_intern_repo_tokenized_dataset,
8 | load_intern_repo_untokenized_dataset)
9 | from .json_dataset import load_json_file
10 | from .llava import LLaVADataset
11 | from .modelscope import process_ms_dataset
12 | from .moss_sft import MOSSSFTDataset
13 | from .refcoco_json import (InvRefCOCOJsonDataset, RefCOCOJsonDataset,
14 | RefCOCOJsonEvalDataset)
15 | from .utils import decode_base64_to_image, expand2square, load_image
16 |
17 | # ignore FutureWarning in hf datasets
18 | warnings.simplefilter(action='ignore', category=FutureWarning)
19 |
20 | __all__ = [
21 | 'process_hf_dataset', 'ConcatDataset', 'MOSSSFTDataset',
22 | 'process_ms_dataset', 'LLaVADataset', 'expand2square',
23 | 'decode_base64_to_image', 'load_image', 'process_ms_dataset',
24 | 'load_intern_repo_tokenized_dataset',
25 | 'load_intern_repo_untokenized_dataset', 'build_packed_dataset',
26 | 'RefCOCOJsonDataset', 'RefCOCOJsonEvalDataset', 'InvRefCOCOJsonDataset',
27 | 'load_json_file'
28 | ]
29 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/collate_fns/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .default_collate_fn import default_collate_fn
3 | from .mmlu_collate_fn import mmlu_collate_fn
4 |
5 | __all__ = ['default_collate_fn', 'mmlu_collate_fn']
6 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/collate_fns/mmlu_collate_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import Dict, Sequence
3 |
4 | import torch
5 | from torch.nn.utils.rnn import pad_sequence
6 |
7 | from xtuner.utils import DEFAULT_PAD_TOKEN_INDEX, IGNORE_INDEX
8 |
9 |
10 | def mmlu_collate_fn(instances: Sequence[Dict],
11 | pad_index: int = DEFAULT_PAD_TOKEN_INDEX,
12 | return_hf_format: bool = False) -> Dict[str, torch.Tensor]:
13 | input_ids = []
14 | labels = []
15 | data_samples = {'labels': [], 'subjects': []}
16 | for example in instances:
17 | input_ids.append(torch.tensor(example['input_ids']))
18 | labels.append(torch.tensor(example['labels']))
19 | data_samples['labels'].append(example['output'])
20 | data_samples['subjects'].append(example['subject'])
21 | if len(instances) > 1:
22 | input_ids = pad_sequence(
23 | input_ids, batch_first=True, padding_value=pad_index)
24 | labels = pad_sequence(
25 | labels, batch_first=True, padding_value=IGNORE_INDEX)
26 | else:
27 | input_ids = torch.stack(input_ids)
28 | labels = torch.stack(labels)
29 |
30 | data_dict = {
31 | 'input_ids': input_ids,
32 | 'attention_mask': input_ids.ne(pad_index),
33 | 'labels': labels
34 | }
35 |
36 | if return_hf_format:
37 | return data_dict
38 | else:
39 | return {'data': data_dict, 'data_samples': data_samples}
40 |
--------------------------------------------------------------------------------
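For illustration, here is what `mmlu_collate_fn` produces for a toy batch. The instance fields mirror what the function reads (`input_ids`, `labels`, `output`, `subject`); the values are made up.

```python
from xtuner.dataset.collate_fns import mmlu_collate_fn

instances = [
    {'input_ids': [1, 5, 7], 'labels': [-100, -100, 7], 'output': 'A', 'subject': 'math'},
    {'input_ids': [1, 9], 'labels': [-100, 9], 'output': 'C', 'subject': 'law'},
]
out = mmlu_collate_fn(instances)
print(out['data']['input_ids'])         # tensor([[1, 5, 7], [1, 9, 0]]), right-padded with 0
print(out['data']['attention_mask'])    # True wherever input_ids != pad_index
print(out['data_samples']['subjects'])  # ['math', 'law']
```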
/omg_llava/xtuner/dataset/concat_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from torch.utils.data import ConcatDataset as TorchConcatDataset
3 |
4 | from xtuner.registry import BUILDER
5 |
6 |
7 | class ConcatDataset(TorchConcatDataset):
8 |
9 | def __init__(self, datasets):
10 | datasets_instance = []
11 | for cfg in datasets:
12 | datasets_instance.append(BUILDER.build(cfg))
13 | super().__init__(datasets=datasets_instance)
14 |
15 | def __repr__(self):
16 | main_str = 'Dataset as a concatenation of multiple datasets. \n'
17 | main_str += ',\n'.join(
18 | [f'{repr(dataset)}' for dataset in self.datasets])
19 | return main_str
20 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/json_dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | from datasets import Dataset, concatenate_datasets
5 |
6 |
7 | def load_json_file(data_files=None, data_dir=None, suffix=None):
8 | assert (data_files is not None) != (data_dir is not None)
9 | if data_dir is not None:
10 | data_files = os.listdir(data_dir)
11 | data_files = [os.path.join(data_dir, fn) for fn in data_files]
12 | if suffix is not None:
13 | data_files = [fp for fp in data_files if fp.endswith(suffix)]
14 | elif isinstance(data_files, str):
15 | data_files = [data_files]
16 |
17 | dataset_list = []
18 | for fp in data_files:
19 | with open(fp, encoding='utf-8') as file:
20 | data = json.load(file)
21 | ds = Dataset.from_list(data)
22 | dataset_list.append(ds)
23 | dataset = concatenate_datasets(dataset_list)
24 | return dataset
25 |
--------------------------------------------------------------------------------
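`load_json_file` expects each JSON file to hold a top-level list of records and concatenates one `datasets.Dataset` per file. A minimal sketch with hypothetical paths:

```python
# Hypothetical paths; pass either data_files or data_dir, never both.
from xtuner.dataset import load_json_file

ds = load_json_file(data_dir='./data/annotations', suffix='.json')
print(len(ds), ds.column_names)

# Or pass explicit files:
ds = load_json_file(data_files=['./data/part0.json', './data/part1.json'])
```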
/omg_llava/xtuner/dataset/map_fns/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dataset_map_fns import * # noqa: F401, F403
3 | from .template_map_fn import template_map_fn # noqa: F401
4 | from .template_map_fn import template_map_fn_factory # noqa: F401
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/alpaca_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 |
4 | def alpaca_map_fn(example):
5 | if example.get('output') == '':
6 | return {'conversation': []}
7 | else:
8 | return {
9 | 'conversation': [{
10 | 'input': f"{example['instruction']}\n{example['input']}",
11 | 'output': example['output']
12 | }]
13 | }
14 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/alpaca_zh_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 |
4 | def alpaca_zh_map_fn(example):
5 | return {
6 | 'conversation': [{
7 | 'input': f"{example['instruction_zh']}\n{example['input_zh']}",
8 | 'output': example['output_zh']
9 | }]
10 | }
11 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/arxiv_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.utils import SYSTEM_TEMPLATE
3 |
4 |
5 | def arxiv_map_fn(example):
6 | return {
7 | 'conversation': [{
8 | 'system': SYSTEM_TEMPLATE.arxiv_gentile,
9 | 'input': example['abstract'],
10 | 'output': example['title']
11 | }]
12 | }
13 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/code_alpaca_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.utils import SYSTEM_TEMPLATE
3 |
4 |
5 | def code_alpaca_map_fn(example):
6 | return {
7 | 'conversation': [{
8 | 'system': SYSTEM_TEMPLATE.coder,
9 | 'input': example['prompt'],
10 | 'output': example['completion']
11 | }]
12 | }
13 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/colors_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.utils import SYSTEM_TEMPLATE
3 |
4 |
5 | def colors_map_fn(example):
6 | desc = ':'.join(example['description'].split(':')[1:]).strip()
7 | return {
8 | 'conversation': [{
9 | 'system': SYSTEM_TEMPLATE.colorist,
10 | 'input': desc,
11 | 'output': example['color']
12 | }]
13 | }
14 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/crime_kg_assitant_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.utils import SYSTEM_TEMPLATE
3 |
4 |
5 | def crime_kg_assitant_map_fn(example):
6 | return {
7 | 'conversation': [{
8 | 'system': SYSTEM_TEMPLATE.lawyer,
9 | 'input': example['input'],
10 | 'output': example['output']
11 | }]
12 | }
13 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/default_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | def default_map_fn(example):
3 | return {
4 | 'conversation': [{
5 | 'input': example['input'],
6 | 'output': example['output']
7 | }]
8 | }
9 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/law_reference_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.utils import SYSTEM_TEMPLATE
3 |
4 |
5 | def law_reference_map_fn(example):
6 | return {
7 | 'conversation': [{
8 | 'system': SYSTEM_TEMPLATE.lawyer,
9 | 'input': example['question'],
10 | 'output': example['answer']
11 | }]
12 | }
13 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.utils import DEFAULT_IMAGE_TOKEN
3 |
4 |
5 | def llava_image_only_map_fn(example):
6 | # input contains the DEFAULT_IMAGE_TOKEN only
7 | messages = example['conversations']
8 | input = ''
9 | conversation = []
10 | while messages and messages[0]['from'] == 'gpt':
11 | # Skip the first one if it is from gpt
12 | messages = messages[1:]
13 | for msg in messages:
14 | if msg['from'] == 'human':
15 | assert DEFAULT_IMAGE_TOKEN in msg['value']
16 | input += DEFAULT_IMAGE_TOKEN
17 | elif msg['from'] == 'gpt':
18 | conversation.append({'input': input, 'output': msg['value']})
19 | input = ''
20 | else:
21 | raise NotImplementedError
22 | return {'conversation': conversation}
23 |
24 |
25 | def llava_map_fn(example):
26 | messages = example['conversations']
27 | input = ''
28 | conversation = []
29 | while messages and messages[0]['from'] == 'gpt':
30 | # Skip the first one if it is from gpt
31 | messages = messages[1:]
32 | for msg in messages:
33 | if msg['from'] == 'human':
34 | if DEFAULT_IMAGE_TOKEN in msg['value']:
35 | msg['value'] = msg['value'].replace(DEFAULT_IMAGE_TOKEN,
36 | '').strip()
37 | msg['value'] = DEFAULT_IMAGE_TOKEN + '\n' + msg['value']
38 | msg['value'] = msg['value'].strip()
39 | input += msg['value']
40 |
41 | elif msg['from'] == 'gpt':
42 | conversation.append({'input': input, 'output': msg['value']})
43 | input = ''
44 | else:
45 | raise NotImplementedError
46 | return {'conversation': conversation}
47 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/medical_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.utils import SYSTEM_TEMPLATE
3 |
4 |
5 | def medical_map_fn(example):
6 | return {
7 | 'conversation': [{
8 | 'system': SYSTEM_TEMPLATE.medical,
9 | 'input': '{instruction}\n{input}'.format(**example),
10 | 'output': example['output']
11 | }]
12 | }
13 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/oasst1_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | def oasst1_map_fn(example):
3 | r"""Example before preprocessing:
4 | example['text'] = '### Human: Can you explain xxx'
5 | '### Assistant: Sure! xxx'
6 | '### Human: I didn't understand how xxx'
7 | '### Assistant: It has to do with a process xxx.'
8 |
9 | Example after preprocessing:
10 | example['conversation'] = [
11 | {
12 | 'input': 'Can you explain xxx',
13 | 'output': 'Sure! xxx'
14 | },
15 | {
16 | 'input': 'I didn't understand how xxx',
17 | 'output': 'It has to do with a process xxx.'
18 | }
19 | ]
20 | """
21 | data = []
22 | for sentence in example['text'].strip().split('###'):
23 | sentence = sentence.strip()
24 | if sentence[:6] == 'Human:':
25 | data.append(sentence[6:].strip())
26 | elif sentence[:10] == 'Assistant:':
27 | data.append(sentence[10:].strip())
28 | if len(data) % 2:
29 | # The last round of conversation solely consists of input
30 | # without any output.
31 | # Discard the input part of the last round, as this part is ignored in
32 | # the loss calculation.
33 | data.pop()
34 | conversation = []
35 | for i in range(0, len(data), 2):
36 | single_turn_conversation = {'input': data[i], 'output': data[i + 1]}
37 | conversation.append(single_turn_conversation)
38 | return {'conversation': conversation}
39 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/openai_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | def openai_map_fn(example):
3 | """
4 | Example before preprocessing:
5 | example["messages"] = [
6 | { "role": "system", "content": "You are an assistant that
7 | occasionally misspells words." },
8 | { "role": "user", "content": "Tell me a story." },
9 | { "role": "assistant", "content": "One day a student
10 | went to schoool." }
11 | ]
12 | Example after preprocessing:
13 | example["conversation"] = [
14 | {
15 | "system": "You are an assistant that occasionally misspells
16 | words.",
17 | "input": "Tell me a story.",
18 | "output": "One day a student went to schoool."
19 | }
20 | ]
21 | """
22 | messages = example['messages']
23 | system = ''
24 | input = ''
25 | conversation = []
26 | while messages and messages[0]['role'] == 'assistant':
27 | # Skip the first one if it is from assistant
28 | messages = messages[1:]
29 | for msg in messages:
30 | if msg['role'] == 'system':
31 | system = msg['content']
32 | elif msg['role'] == 'user':
33 | input += msg['content']
34 | elif msg['role'] == 'assistant':
35 | conversation.append({
36 | 'system': system,
37 | 'input': input,
38 | 'output': msg['content']
39 | })
40 | system = ''
41 | input = ''
42 | else:
43 | raise NotImplementedError
44 | return {'conversation': conversation}
45 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/openorca_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | def openorca_map_fn(example):
3 | return {
4 | 'conversation': [{
5 | 'system': example['system_prompt'],
6 | 'input': example['question'],
7 | 'output': example['response']
8 | }]
9 | }
10 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/pretrain_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | def pretrain_map_fn(example):
3 | r"""Example before preprocessing:
4 | example['text'] = 'xxx'
5 |
6 | Example after preprocessing:
7 | example['conversation'] = [
8 | {
9 | 'input': '',
10 | 'output': 'xxx'
11 | },
12 | ]
13 | """
14 | return {
15 | 'conversation': [{
16 | 'input': '',
17 | 'output': example['text'].strip(),
18 | 'need_eos_token': False
19 | }]
20 | }
21 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/sql_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.utils import SYSTEM_TEMPLATE
3 |
4 |
5 | def sql_map_fn(example):
6 | return {
7 | 'conversation': [{
8 | 'system': SYSTEM_TEMPLATE.sql,
9 | 'input': '{context}\n{question}'.format(**example),
10 | 'output': example['answer']
11 | }]
12 | }
13 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/stack_exchange_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | def stack_exchange_map_fn(example):
3 | return {
4 | 'conversation': [{
5 | 'input': example['question'],
6 | 'output': example['response']
7 | }]
8 | }
9 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/tiny_codes_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.utils import SYSTEM_TEMPLATE
3 |
4 |
5 | def tiny_codes_map_fn(example):
6 | return {
7 | 'conversation': [{
8 | 'system': SYSTEM_TEMPLATE.coder,
9 | 'input': example['prompt'],
10 | 'output': example['response']
11 | }]
12 | }
13 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/dataset_map_fns/wizardlm_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | def wizardlm_map_fn(example):
3 | messages = example['conversations']
4 | input = ''
5 | conversation = []
6 | while messages and messages[0]['from'] == 'gpt':
7 | # Skip the first one if it is from gpt
8 | messages = messages[1:]
9 | for msg in messages:
10 | if msg['from'] == 'human':
11 | input += msg['value']
12 | elif msg['from'] == 'gpt':
13 | conversation.append({'input': input, 'output': msg['value']})
14 | input = ''
15 | else:
16 | raise NotImplementedError
17 | return {'conversation': conversation}
18 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/map_fns/template_map_fn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from functools import partial
3 |
4 | from mmengine.utils.misc import get_object_from_string
5 |
6 |
7 | def template_map_fn(example, template):
8 | conversation = example.get('conversation', [])
9 | for i, single_turn_conversation in enumerate(conversation):
10 | input = single_turn_conversation.get('input', '')
11 | if input is None:
12 | input = ''
13 | input_text = template.INSTRUCTION.format(input=input, round=i + 1)
14 | system = single_turn_conversation.get('system', '')
15 | if system != '' and system is not None:
16 | system = template.SYSTEM.format(system=system)
17 | input_text = system + input_text
18 | single_turn_conversation['input'] = input_text
19 |
20 | if template.get('SUFFIX', None):
21 | output_text = single_turn_conversation.get('output', '')
22 | output_text += template.SUFFIX
23 | single_turn_conversation['output'] = output_text
24 |
25 | # SUFFIX_AS_EOS is False ==> need_eos_token is True
26 | single_turn_conversation['need_eos_token'] = \
27 | not template.get('SUFFIX_AS_EOS', False)
28 | single_turn_conversation['sep'] = template.get('SEP', '')
29 |
30 | return {'conversation': conversation}
31 |
32 |
33 | def template_map_fn_factory(template):
34 | if isinstance(template, str): # for resume
35 | template = get_object_from_string(template)
36 | return partial(template_map_fn, template=template)
37 |
--------------------------------------------------------------------------------
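To make the control flow above concrete, here is a toy walk-through of `template_map_fn_factory`. The template below is a made-up `ConfigDict`, not the library's real `PROMPT_TEMPLATE.default`; it only carries the keys the function reads (`SYSTEM`, `INSTRUCTION`, and optionally `SUFFIX`, `SUFFIX_AS_EOS`, `SEP`).

```python
# Toy template, for illustration only.
from mmengine.config import ConfigDict

from xtuner.dataset.map_fns import template_map_fn_factory

toy_template = ConfigDict(
    SYSTEM='<system>{system}</system>\n',
    INSTRUCTION='<user round {round}>{input}</user>\n<assistant>',
    SUFFIX='</assistant>',
    SUFFIX_AS_EOS=True,
    SEP='\n')

map_fn = template_map_fn_factory(template=toy_template)
example = {'conversation': [{'system': 'Be terse.', 'input': 'Hi', 'output': 'Hello'}]}
print(map_fn(example))
# -> input becomes '<system>Be terse.</system>\n<user round 1>Hi</user>\n<assistant>',
#    output becomes 'Hello</assistant>', need_eos_token is False, sep is '\n'
```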
/omg_llava/xtuner/dataset/modelscope.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmengine.config import Config, ConfigDict
3 |
4 | from xtuner.registry import BUILDER
5 | from .huggingface import process_hf_dataset
6 |
7 |
8 | def process_ms_dataset(dataset, split='train', *args, **kwargs):
9 | """Post-process the dataset loaded from the ModelScope Hub."""
10 |
11 | if isinstance(dataset, (Config, ConfigDict)):
12 | dataset = BUILDER.build(dataset)
13 | if isinstance(dataset, dict):
14 | dataset = dataset[split]
15 | dataset = dataset.to_hf_dataset()
16 | return process_hf_dataset(dataset, *args, **kwargs)
17 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/dataset/samplers/__init__.py:
--------------------------------------------------------------------------------
1 | from .intern_repo import InternlmRepoSampler, InternRepoSampler
2 | from .length_grouped import LengthGroupedSampler
3 |
4 | __all__ = ['LengthGroupedSampler', 'InternRepoSampler', 'InternlmRepoSampler']
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/engine/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from ._strategy import DeepSpeedStrategy
3 | from .hooks import (DatasetInfoHook, EvaluateChatHook, ThroughputHook,
4 | VarlenAttnArgsToMessageHubHook)
5 | from .runner import TrainLoop
6 |
7 | __all__ = [
8 | 'EvaluateChatHook', 'DatasetInfoHook', 'ThroughputHook',
9 | 'VarlenAttnArgsToMessageHubHook', 'DeepSpeedStrategy', 'TrainLoop'
10 | ]
11 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/engine/_strategy/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .deepspeed import DeepSpeedStrategy
3 |
4 | __all__ = ['DeepSpeedStrategy']
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/engine/hooks/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dataset_info_hook import DatasetInfoHook
3 | from .evaluate_chat_hook import EvaluateChatHook
4 | from .hf_checkpoint_hook import HFCheckpointHook
5 | from .throughput_hook import ThroughputHook
6 | from .varlen_attn_args_to_messagehub_hook import VarlenAttnArgsToMessageHubHook
7 |
8 | __all__ = [
9 | 'EvaluateChatHook', 'DatasetInfoHook', 'ThroughputHook',
10 | 'VarlenAttnArgsToMessageHubHook', 'HFCheckpointHook'
11 | ]
12 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/engine/runner/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .loops import TrainLoop
3 |
4 | __all__ = ['TrainLoop']
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/engine/runner/loops.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import Dict, Optional, Union
3 |
4 | from mmengine.runner import IterBasedTrainLoop
5 | from torch.utils.data import DataLoader
6 |
7 |
8 | class TrainLoop(IterBasedTrainLoop):
9 |
10 | def __init__(self,
11 | runner,
12 | dataloader: Union[DataLoader, Dict],
13 | max_iters: Optional[int] = None,
14 | max_epochs: Union[int, float] = None,
15 | **kwargs) -> None:
16 |
17 | if max_iters is None and max_epochs is None:
18 | raise RuntimeError('Please specify the `max_iters` or '
19 | '`max_epochs` in `train_cfg`.')
20 | elif max_iters is not None and max_epochs is not None:
21 | raise RuntimeError('Only one of `max_iters` or `max_epochs` can '
22 | 'exist in `train_cfg`.')
23 | else:
24 | if max_iters is not None:
25 | iters = int(max_iters)
26 | assert iters == max_iters, ('`max_iters` should be an integer '
27 | f'number, but got {max_iters}')
28 | elif max_epochs is not None:
29 | if isinstance(dataloader, dict):
30 | diff_rank_seed = runner._randomness_cfg.get(
31 | 'diff_rank_seed', False)
32 | dataloader = runner.build_dataloader(
33 | dataloader,
34 | seed=runner.seed,
35 | diff_rank_seed=diff_rank_seed)
36 | iters = max_epochs * len(dataloader)
37 | else:
38 | raise NotImplementedError
39 | super().__init__(
40 | runner=runner, dataloader=dataloader, max_iters=iters, **kwargs)
41 |
--------------------------------------------------------------------------------
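In configs, `TrainLoop` is referenced through `train_cfg`, passing exactly one of `max_epochs` or `max_iters` (supplying both, or neither, raises a `RuntimeError`). A config-style sketch:

```python
from xtuner.engine.runner import TrainLoop

# Epoch-based training; the loop converts epochs to iterations internally.
train_cfg = dict(type=TrainLoop, max_epochs=3)

# Or, equivalently, with a fixed iteration budget:
# train_cfg = dict(type=TrainLoop, max_iters=10000)
```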
/omg_llava/xtuner/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .metrics import MMLUMetric
3 |
4 | __all__ = ['MMLUMetric']
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/evaluation/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .mmlu_metric import MMLUMetric
3 |
4 | __all__ = ['MMLUMetric']
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .llava import LLaVAModel
3 | from .sft import SupervisedFinetune
4 |
5 | __all__ = ['SupervisedFinetune', 'LLaVAModel']
6 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .dispatch import dispatch_modules
2 | from .projector import ProjectorConfig, ProjectorModel
3 |
4 | __all__ = ['dispatch_modules', 'ProjectorConfig', 'ProjectorModel']
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/modules/dispatch/triton_kernels/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .layer_norm import layer_norm_forward
3 | from .rms_norm import rms_norm_forward
4 | from .rotary import apply_rotary_emb
5 |
6 | __all__ = ['rms_norm_forward', 'layer_norm_forward', 'apply_rotary_emb']
7 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/modules/dispatch/triton_kernels/layer_norm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 | import torch.nn.functional as F
4 |
5 |
6 | def layer_norm_forward(self, hidden_states):
7 | input_dtype = hidden_states.dtype
8 | hidden_states = hidden_states.to(torch.float32)
9 | hidden_states = F.layer_norm(
10 | hidden_states, (hidden_states.shape[-1], ), eps=self.variance_epsilon)
11 | hidden_states = self.weight.to(torch.float32) * hidden_states
12 | return hidden_states.to(input_dtype)
13 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/modules/projector/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from transformers import AutoConfig, AutoModel
3 |
4 | from .configuration_projector import ProjectorConfig
5 | from .modeling_projector import ProjectorModel
6 |
7 | AutoConfig.register('projector', ProjectorConfig)
8 | AutoModel.register(ProjectorConfig, ProjectorModel)
9 |
10 | __all__ = ['ProjectorConfig', 'ProjectorModel']
11 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/modules/projector/configuration_projector.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from transformers import PretrainedConfig
3 |
4 |
5 | class ProjectorConfig(PretrainedConfig):
6 | model_type = 'projector'
7 | _auto_class = 'AutoConfig'
8 |
9 | def __init__(
10 | self,
11 | visual_hidden_size=4096,
12 | llm_hidden_size=4096,
13 | depth=2,
14 | hidden_act='gelu',
15 | bias=True,
16 | **kwargs,
17 | ):
18 | self.visual_hidden_size = visual_hidden_size
19 | self.llm_hidden_size = llm_hidden_size
20 | self.depth = depth
21 | self.hidden_act = hidden_act
22 | self.bias = bias
23 | super().__init__(**kwargs)
24 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/transformers_models/__init__.py:
--------------------------------------------------------------------------------
1 | from .deepseek_v2 import (DeepseekTokenizerFast, DeepseekV2Config,
2 | DeepseekV2ForCausalLM, DeepseekV2Model)
3 | from .mixtral import MixtralConfig, MixtralForCausalLM, MixtralModel
4 |
5 | __all__ = [
6 | 'DeepseekTokenizerFast', 'DeepseekV2Config', 'DeepseekV2ForCausalLM',
7 | 'DeepseekV2Model', 'MixtralConfig', 'MixtralForCausalLM', 'MixtralModel'
8 | ]
9 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/transformers_models/deepseek_v2/__init__.py:
--------------------------------------------------------------------------------
1 | from .configuration_deepseek import DeepseekV2Config
2 | from .modeling_deepseek import DeepseekV2ForCausalLM, DeepseekV2Model
3 | from .tokenization_deepseek_fast import DeepseekTokenizerFast
4 |
5 | __all__ = [
6 | 'DeepseekV2ForCausalLM', 'DeepseekV2Model', 'DeepseekV2Config',
7 | 'DeepseekTokenizerFast'
8 | ]
9 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/transformers_models/deepseek_v2/tokenization_deepseek_fast.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Union
2 |
3 | from transformers.models.llama import LlamaTokenizerFast
4 |
5 |
6 | class DeepseekTokenizerFast(LlamaTokenizerFast):
7 |
8 | def convert_ids_to_tokens(
9 | self,
10 | ids: Union[int, List[int]],
11 | skip_special_tokens: bool = False) -> Union[str, List[str]]:
12 | """Converts a single index or a sequence of indices in a token or a
13 | sequence of tokens, using the vocabulary and added tokens.
14 |
15 | Args:
16 | ids (`int` or `List[int]`):
17 | The token id (or token ids) to convert to tokens.
18 | skip_special_tokens (`bool`, *optional*, defaults to `False`):
19 | Whether or not to remove special tokens in the decoding.
20 |
21 | Returns:
22 | `str` or `List[str]`: The decoded token(s).
23 | """
24 | if isinstance(ids, int):
25 | return self._convert_id_to_token(ids)
26 | tokens = []
27 | for index in ids:
28 | index = int(index)
29 | if skip_special_tokens and index in self.all_special_ids:
30 | continue
31 | token = self._tokenizer.id_to_token(index)
32 | tokens.append(token if token is not None else '')
33 | return tokens
34 |
35 | def _convert_id_to_token(self, index: int) -> Optional[str]:
36 | token = self._tokenizer.id_to_token(int(index))
37 | return token if token is not None else ''
38 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/model/transformers_models/mixtral/__init__.py:
--------------------------------------------------------------------------------
1 | from .configuration_mixtral import MixtralConfig
2 | from .modeling_mixtral import MixtralForCausalLM, MixtralModel
3 |
4 | __all__ = ['MixtralForCausalLM', 'MixtralModel', 'MixtralConfig']
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/parallel/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .sequence import * # noqa: F401, F403
3 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/parallel/sequence/reduce_loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed as dist
3 |
4 | from .setup_distributed import get_sequence_parallel_group
5 |
6 |
7 | class _ReduceLoss(torch.autograd.Function):
8 |
9 | @staticmethod
10 | def forward(ctx, mean_loss, loss_scale, process_group):
11 | ctx.mode = process_group
12 | if loss_scale == 0:
13 | # convert nan to 0 just for logging
14 | mean_loss = torch.nan_to_num(mean_loss)
15 | loss_sum = mean_loss * loss_scale
16 | dist.all_reduce(loss_sum, group=process_group)
17 | dist.all_reduce(loss_scale, group=process_group)
18 | loss = loss_sum / loss_scale
19 | return loss
20 |
21 | @staticmethod
22 | def backward(ctx, grad_output):
23 | return grad_output, None, None
24 |
25 |
26 | def reduce_sequence_parallel_loss(mean_loss,
27 | loss_scale,
28 | sp_group: dist.ProcessGroup = None):
29 | if dist.get_world_size(sp_group) == 1:
30 | return mean_loss
31 | if sp_group is None:
32 | # avoid bc breaking
33 | sp_group = get_sequence_parallel_group()
34 | return _ReduceLoss.apply(mean_loss, loss_scale, sp_group)
35 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/parallel/sequence/sampler.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import math
3 | from typing import Optional, Sized
4 |
5 | from mmengine.dataset import DefaultSampler
6 | from mmengine.dist import sync_random_seed
7 |
8 | from .setup_distributed import (get_data_parallel_rank,
9 | get_data_parallel_world_size)
10 |
11 |
12 | class SequenceParallelSampler(DefaultSampler):
13 |
14 | def __init__(self,
15 | dataset: Sized,
16 | shuffle: bool = True,
17 | seed: Optional[int] = None,
18 | round_up: bool = True) -> None:
19 | rank = get_data_parallel_rank()
20 | world_size = get_data_parallel_world_size()
21 | self.rank = rank
22 | self.world_size = world_size
23 |
24 | self.dataset = dataset
25 | self.shuffle = shuffle
26 | if seed is None:
27 | seed = sync_random_seed()
28 | self.seed = seed
29 | self.epoch = 0
30 | self.round_up = round_up
31 |
32 | if self.round_up:
33 | self.num_samples = math.ceil(len(self.dataset) / world_size)
34 | self.total_size = self.num_samples * self.world_size
35 | else:
36 | self.num_samples = math.ceil(
37 | (len(self.dataset) - rank) / world_size)
38 | self.total_size = len(self.dataset)
39 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/registry.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmengine.registry import Registry
3 |
4 | __all__ = ['BUILDER', 'MAP_FUNC']
5 |
6 | BUILDER = Registry('builder')
7 | MAP_FUNC = Registry('map_fn')
8 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/copy_cfg.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import argparse
3 | import os.path as osp
4 | import shutil
5 |
6 | from mmengine.utils import mkdir_or_exist
7 |
8 | from xtuner.configs import cfgs_name_path
9 |
10 |
11 | def parse_args():
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('config_name', help='config name')
14 | parser.add_argument('save_dir', help='save directory for copied config')
15 | args = parser.parse_args()
16 | return args
17 |
18 |
19 | def add_copy_suffix(string):
20 | file_name, ext = osp.splitext(string)
21 | return f'{file_name}_copy{ext}'
22 |
23 |
24 | def main():
25 | args = parse_args()
26 | mkdir_or_exist(args.save_dir)
27 | config_path = cfgs_name_path[args.config_name]
28 | save_path = osp.join(args.save_dir,
29 | add_copy_suffix(osp.basename(config_path)))
30 | shutil.copyfile(config_path, save_path)
31 | print(f'Copy to {save_path}')
32 |
33 |
34 | if __name__ == '__main__':
35 | main()
36 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/data_preprocess/convert_refcoco.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import argparse
3 | import json
4 |
5 | from xtuner.dataset.refcoco_json import RefCOCOJsonDataset
6 |
7 |
8 | def parse_args():
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument(
11 | '--ann-path',
12 | default='data/refcoco_annotations',
13 | help='Refcoco annotation path',
14 | )
15 | parser.add_argument(
16 | '--image-path',
17 | default='data/llava_data/llava_images/coco/train2017',
18 | help='COCO image path',
19 | )
20 | parser.add_argument(
21 | '--save-path', default='./', help='The folder to save converted data')
22 | args = parser.parse_args()
23 | return args
24 |
25 |
26 | if __name__ == '__main__':
27 | args = parse_args()
28 |
29 | data_info = [
30 | ('refcoco', 'unc'),
31 | ('refcoco+', 'unc'),
32 | ('refcocog', 'umd'),
33 | ]
34 | all_data = []
35 | for dataset, split in data_info:
36 | data = RefCOCOJsonDataset.get_data_json(
37 | ann_path=args.ann_path,
38 | image_path=args.image_path,
39 | dataset=dataset,
40 | splitBy=split,
41 | )[0]
42 | all_data.extend(data)
43 | save_path = args.save_path + '/train.json'
44 | with open(save_path, 'w') as f:
45 | print(f'save to {save_path} with {len(all_data)} items.')
46 | print(all_data[0])
47 | json.dump(all_data, f, indent=4)
48 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/get_data_order.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import argparse
3 | import os
4 |
5 |
6 | def parse_args():
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('--data-folder', help='Data folder')
9 | parser.add_argument('--save-folder', help='The folder to save data order.')
10 | parser.add_argument(
11 | '--file-type',
12 | default='.bin',
13 | help='We want to get the order of the file in this type.')
14 | args = parser.parse_args()
15 | return args
16 |
17 |
18 | def save_data_order(data_folder, save_folder, file_type='.bin'):
19 | assert os.path.exists(data_folder), f'{data_folder} does not exist.'
20 | triples = list(os.walk(data_folder, followlinks=True))
21 | data_order = []
22 | for root, dirs, files in triples:
23 | dirs.sort()
24 | print(f'Reading {root}...')
25 | for fn in sorted(files):
26 | if fn.endswith(file_type):
27 | fp = os.path.join(root, fn)
28 | # Using relative paths so that you can get the same result
29 | # on different clusters
30 | fp = fp.replace(data_folder, '')[1:]
31 | data_order.append(fp)
32 |
33 | save_path = os.path.join(save_folder, 'data_order.txt')
34 | with open(save_path, 'w') as f:
35 | for fp in data_order:
36 | f.write(fp + '\n')
37 |
38 |
39 | if __name__ == '__main__':
40 | args = parse_args()
41 | save_data_order(args.data_folder, args.save_folder, args.file_type)
42 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/list_cfg.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import argparse
3 |
4 | from xtuner.configs import cfgs_name_path
5 |
6 |
7 | def parse_args():
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument(
10 | '-p', '--pattern', default=None, help='Pattern for fuzzy matching')
11 | args = parser.parse_args()
12 | return args
13 |
14 |
15 | def main(pattern=None):
16 | args = parse_args()
17 | configs_names = sorted(list(cfgs_name_path.keys()))
18 | print('==========================CONFIGS===========================')
19 | if args.pattern is not None:
20 | print(f'PATTERN: {args.pattern}')
21 | print('-------------------------------')
22 | for name in configs_names:
23 | if args.pattern is None or args.pattern.lower() in name.lower():
24 | print(name)
25 | print('=============================================================')
26 |
27 |
28 | if __name__ == '__main__':
29 | main()
30 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/list_dataset_format.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from xtuner.dataset.map_fns import DATASET_FORMAT_MAPPING
3 |
4 |
5 | def main():
6 | dataset_format = DATASET_FORMAT_MAPPING.keys()
7 | print('======================DATASET_FORMAT======================')
8 | for format in dataset_format:
9 | print(format)
10 | print('==========================================================')
11 |
12 |
13 | if __name__ == '__main__':
14 | main()
15 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/model_converters/modeling_internlm2_reward/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lxtGH/OMG-Seg/1ae2e7d446a607fddaefabf4ace312a2cdf7ab55/omg_llava/xtuner/tools/model_converters/modeling_internlm2_reward/__init__.py
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/plugins/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .api import plugins_api
3 |
4 | __all__ = ['plugins_api']
5 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/plugins/api.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import re
3 |
4 |
5 | def plugins_api(input_str,
6 | calculate_open=True,
7 | solve_open=True,
8 | search_open=True):
9 |
10 | pattern = r'(Solve|solve|Solver|solver|Calculate|calculate|Calculator|calculator|Search)\("([^"]*)"\)' # noqa: E501
11 |
12 | matches = re.findall(pattern, input_str)
13 |
14 | converted_str = '<|Results|>:\n'
15 |
16 | for i in range(len(matches)):
17 | if matches[i][0] in [
18 | 'Calculate', 'calculate',
19 | 'Calculator', 'calculator'
20 | ]:
21 | if calculate_open:
22 | from .calculate import Calculate
23 | result = Calculate(matches[i][1])
24 | else:
25 | result = None
26 | converted_str += f"Calculate(\"{matches[i][1]}\") => {result}\n"
27 | elif matches[i][0] in ['Solve', 'solve', 'Solver', 'solver']:
28 | if solve_open:
29 | from .solve import Solve
30 | result = Solve(matches[i][1])
31 | else:
32 | result = None
33 | converted_str += f"Solve(\"{matches[i][1]}\") =>\n{result}\n"
34 | elif matches[i][0] == 'Search':
35 | if search_open:
36 | from .search import Search
37 | result = Search(matches[i][1])
38 | else:
39 | result = None
40 | converted_str += f"Search(\"{matches[i][1]}\") =>\n{result}"
41 |
42 | converted_str += '\n'
43 | return converted_str
44 |
--------------------------------------------------------------------------------
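`plugins_api` scans a model reply for tool-call patterns such as `Calculate("...")`, dispatches them to the plugin modules, and returns a `<|Results|>:` block. A sketch with only the calculator enabled, so no solver dependency or Serper API key is needed:

```python
from xtuner.tools.plugins import plugins_api

reply = 'Let me compute that: Calculate("(3 + 5) * 2")'
print(plugins_api(reply, solve_open=False, search_open=False))
# Prints something like:
# <|Results|>:
# Calculate("(3 + 5) * 2") => 16.00
```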
/omg_llava/xtuner/tools/plugins/calculate.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from math import * # noqa: F401, F403
3 |
4 |
5 | def Calculate(expression):
6 | res = ''
7 | for exp in expression.split(';'):
8 | try:
9 | res += '{:.2f};'.format(eval(exp.replace('^', '**')))
10 | except Exception:
11 | res += 'No result.'
12 | if res[-1] == ';':
13 | res = res[:-1]
14 | return res
15 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/plugins/search.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import os
3 | import sys
4 |
5 | import requests
6 |
7 | try:
8 | SERPER_API_KEY = os.environ['SERPER_API_KEY']
9 | except Exception:
10 | print('Please obtain the `SERPER_API_KEY` from https://serper.dev and '
11 | 'set it using `export SERPER_API_KEY=xxx`.')
12 | sys.exit(1)
13 |
14 |
15 | def parse_results(results, k=10):
16 | snippets = []
17 |
18 | for result in results['organic'][:k]:
19 | if 'snippet' in result:
20 | snippets.append(result['snippet'])
21 | for attribute, value in result.get('attributes', {}).items():
22 | snippets.append(f'{attribute}: {value}.')
23 | return snippets
24 |
25 |
26 | def search(api_key, search_term, **kwargs):
27 | headers = {
28 | 'X-API-KEY': api_key,
29 | 'Content-Type': 'application/json',
30 | }
31 | params = {
32 | 'q': search_term,
33 | **{key: value
34 | for key, value in kwargs.items() if value is not None},
35 | }
36 | try:
37 | response = requests.post(
38 | 'https://google.serper.dev/search',
39 | headers=headers,
40 | params=params,
41 | timeout=5)
42 | except Exception as e:
43 | return -1, str(e)
44 | return response.status_code, response.json()
45 |
46 |
47 | def Search(q, k=10):
48 | status_code, response = search(SERPER_API_KEY, q)
49 | if status_code != 200:
50 | ret = 'None\n'
51 | else:
52 | text = parse_results(response, k=k)
53 | ret = ''
54 | for idx, res in enumerate(text):
55 | ret += f"<|{idx+1}|>: '{res}'\n"
56 | return ret
57 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/tools/process_untokenized_llava_data.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import argparse
3 | import warnings
4 |
5 | from mmengine import Config
6 |
7 | from xtuner.registry import BUILDER
8 |
9 | # ignore FutureWarning in hf datasets
10 | warnings.simplefilter(action='ignore', category=FutureWarning)
11 |
12 |
13 | def parse_args():
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('config', help='config file name or path.')
16 | parser.add_argument('--save-folder', help='The folder to save data order.')
17 | args = parser.parse_args()
18 | return args
19 |
20 |
21 | def build_llava_dataset(config):
22 | dataset = BUILDER.build(config.train_dataloader.dataset)
23 | return dataset
24 |
25 |
26 | if __name__ == '__main__':
27 | args = parse_args()
28 | cfg = Config.fromfile(args.config)
29 |
30 | llava_dataset = build_llava_dataset(cfg)
31 | text_data = llava_dataset.text_data
32 |
33 | text_data.save_to_disk(args.save_folder)
34 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .constants import (DEFAULT_IMAGE_TOKEN, DEFAULT_PAD_TOKEN_INDEX,
3 | IGNORE_INDEX, IMAGE_TOKEN_INDEX)
4 | from .handle_moe_load_and_save import (SUPPORT_MODELS, get_origin_state_dict,
5 | load_state_dict_into_model)
6 | from .stop_criteria import StopWordStoppingCriteria
7 | from .templates import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
8 |
9 | __all__ = [
10 | 'IGNORE_INDEX', 'DEFAULT_PAD_TOKEN_INDEX', 'PROMPT_TEMPLATE',
11 | 'DEFAULT_IMAGE_TOKEN', 'SYSTEM_TEMPLATE', 'StopWordStoppingCriteria',
12 | 'IMAGE_TOKEN_INDEX', 'load_state_dict_into_model', 'get_origin_state_dict',
13 | 'SUPPORT_MODELS'
14 | ]
15 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/utils/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | IGNORE_INDEX = -100
3 | DEFAULT_PAD_TOKEN_INDEX = 0
4 | IMAGE_TOKEN_INDEX = -200
5 | DEFAULT_IMAGE_TOKEN = '<image>'
6 |
--------------------------------------------------------------------------------
/omg_llava/xtuner/utils/stop_criteria.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from transformers import StoppingCriteria
3 |
4 |
5 | class StopWordStoppingCriteria(StoppingCriteria):
6 | """StopWord stopping criteria."""
7 |
8 | def __init__(self, tokenizer, stop_word):
9 | self.tokenizer = tokenizer
10 | self.stop_word = stop_word
11 | self.length = len(self.stop_word)
12 |
13 | def __call__(self, input_ids, *args, **kwargs) -> bool:
14 | cur_text = self.tokenizer.decode(input_ids[0])
15 | cur_text = cur_text.replace('\r', '').replace('\n', '')
16 | return cur_text[-self.length:] == self.stop_word
17 |
--------------------------------------------------------------------------------
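`StopWordStoppingCriteria` plugs into the standard `transformers` generation loop via `StoppingCriteriaList`. A minimal sketch; the checkpoint name and the `<|im_end|>` stop word are placeholders, not a recommendation for any particular model:

```python
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          StoppingCriteriaList)

from xtuner.utils import StopWordStoppingCriteria

model_name = 'internlm/internlm2-chat-7b'  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

stop_words = ['<|im_end|>']  # placeholder stop word
stopping_criteria = StoppingCriteriaList(
    [StopWordStoppingCriteria(tokenizer, w) for w in stop_words])

inputs = tokenizer('Hello, my name is', return_tensors='pt')
out = model.generate(**inputs, max_new_tokens=64,
                     stopping_criteria=stopping_criteria)
print(tokenizer.decode(out[0]))
```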
/omg_llava/xtuner/version.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | __version__ = '0.1.21'
3 | short_version = __version__
4 |
5 |
6 | def parse_version_info(version_str):
7 | """Parse a version string into a tuple.
8 |
9 | Args:
10 | version_str (str): The version string.
11 | Returns:
12 | tuple[int or str]: The version info, e.g., "1.3.0" is parsed into
13 | (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
14 | """
15 | version_info = []
16 | for x in version_str.split('.'):
17 | if x.isdigit():
18 | version_info.append(int(x))
19 | elif x.find('rc') != -1:
20 | patch_version = x.split('rc')
21 | version_info.append(int(patch_version[0]))
22 | version_info.append(f'rc{patch_version[1]}')
23 | return tuple(version_info)
24 |
25 |
26 | version_info = parse_version_info(__version__)
27 |
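
A quick check of the parsing behaviour described in the docstring (assuming the packaged module is importable):

    from xtuner.version import parse_version_info, version_info

    assert parse_version_info('0.1.21') == (0, 1, 21)
    assert parse_version_info('2.0.0rc1') == (2, 0, 0, 'rc1')
    print(version_info)  # (0, 1, 21)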
--------------------------------------------------------------------------------
/seg/configs/_base_/default_runtime.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
3 | LoggerHook, ParamSchedulerHook)
4 | from mmengine.runner import LogProcessor
5 | from mmengine.visualization import LocalVisBackend
6 |
7 | from mmdet.engine.hooks import DetVisualizationHook
8 | from mmdet.visualization import DetLocalVisualizer
9 |
10 | default_scope = None
11 |
12 | default_hooks = dict(
13 | timer=dict(type=IterTimerHook),
14 | logger=dict(type=LoggerHook, interval=50),
15 | param_scheduler=dict(type=ParamSchedulerHook),
16 | checkpoint=dict(type=CheckpointHook, interval=1, max_keep_ckpts=1),
17 | sampler_seed=dict(type=DistSamplerSeedHook),
18 | visualization=dict(type=DetVisualizationHook))
19 |
20 | env_cfg = dict(
21 | cudnn_benchmark=False,
22 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
23 | dist_cfg=dict(backend='nccl'),
24 | )
25 |
26 | vis_backends = [dict(type=LocalVisBackend)]
27 | visualizer = dict(
28 | type=DetLocalVisualizer, vis_backends=vis_backends, name='visualizer')
29 | log_processor = dict(type=LogProcessor, window_size=50, by_epoch=True)
30 |
31 | log_level = 'INFO'
32 | load_from = None
33 | resume = False
34 |
--------------------------------------------------------------------------------
/seg/configs/_base_/schedules/schedule_12e.py:
--------------------------------------------------------------------------------
1 | from mmengine.optim import LinearLR, MultiStepLR, OptimWrapper
2 | from mmengine.runner import EpochBasedTrainLoop, ValLoop, TestLoop
3 | from torch.optim import AdamW
4 |
5 | # training schedule for 12e
6 | train_cfg = dict(
7 | type=EpochBasedTrainLoop,
8 | max_epochs=12,
9 | val_interval=2,
10 | )
11 | val_cfg = dict(type=ValLoop)
12 | test_cfg = dict(type=TestLoop)
13 |
14 | # learning rate
15 | param_scheduler = [
16 | dict(
17 | type=LinearLR,
18 | start_factor=0.001,
19 | by_epoch=False,
20 | begin=0,
21 | end=500
22 | ),
23 | dict(
24 | type=MultiStepLR,
25 | begin=0,
26 | end=12,
27 | by_epoch=True,
28 | milestones=[8, 11],
29 | gamma=0.1
30 | )
31 | ]
32 |
33 | _embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
34 | optim_wrapper = dict(
35 | type=OptimWrapper,
36 | optimizer=dict(
37 | type=AdamW,
38 | lr=0.0001,
39 | weight_decay=0.05,
40 | eps=1e-8,
41 | betas=(0.9, 0.999)
42 | ),
43 | paramwise_cfg=dict(
44 | custom_keys={
45 | 'backbone': dict(lr_mult=0.1, decay_mult=1.0),
46 | 'query_embed': _embed_multi,
47 | 'query_feat': _embed_multi,
48 | 'level_embed': _embed_multi,
49 | },
50 | norm_decay_mult=0.0
51 | ),
52 | clip_grad=dict(max_norm=0.01, norm_type=2)
53 | )
54 |
55 | # Default setting for scaling LR automatically
56 | # - `enable`: whether to scale the LR automatically
57 | #   with the actual batch size.
58 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
59 | auto_scale_lr = dict(enable=True, base_batch_size=16)
60 |
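
For reference, a small sketch of the linear scaling rule that `auto_scale_lr` turns on (mirroring mmengine's behaviour; the 4-GPU setup is hypothetical): the optimizer LR is multiplied by the ratio of the actual total batch size to `base_batch_size`.

    base_lr = 0.0001           # optimizer lr above
    base_batch_size = 16       # 8 GPUs x 2 samples per GPU
    actual_batch_size = 4 * 2  # e.g. 4 GPUs x 2 samples per GPU (hypothetical)
    scaled_lr = base_lr * actual_batch_size / base_batch_size
    print(scaled_lr)           # 5e-05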
--------------------------------------------------------------------------------
/seg/configs/_base_/schedules/schedule_24e.py:
--------------------------------------------------------------------------------
1 | from mmengine.optim import LinearLR, MultiStepLR, OptimWrapper
2 | from mmengine.runner import EpochBasedTrainLoop, ValLoop, TestLoop
3 | from torch.optim import AdamW
4 |
5 | # training schedule for 24e
6 | train_cfg = dict(
7 | type=EpochBasedTrainLoop,
8 | max_epochs=24,
9 | val_interval=2,
10 | )
11 | val_cfg = dict(type=ValLoop)
12 | test_cfg = dict(type=TestLoop)
13 |
14 | # learning rate
15 | param_scheduler = [
16 | dict(
17 | type=LinearLR,
18 | start_factor=0.001,
19 | by_epoch=False,
20 | begin=0,
21 | end=500
22 | ),
23 | dict(
24 | type=MultiStepLR,
25 | begin=0,
26 | end=24,
27 | by_epoch=True,
28 | milestones=[16, 22],
29 | gamma=0.1
30 | )
31 | ]
32 |
33 | _embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
34 | optim_wrapper = dict(
35 | type=OptimWrapper,
36 | optimizer=dict(
37 | type=AdamW,
38 | lr=0.0001,
39 | weight_decay=0.05,
40 | eps=1e-8,
41 | betas=(0.9, 0.999)
42 | ),
43 | paramwise_cfg=dict(
44 | custom_keys={
45 | 'backbone': dict(lr_mult=0.1, decay_mult=1.0),
46 | 'query_embed': _embed_multi,
47 | 'query_feat': _embed_multi,
48 | 'level_embed': _embed_multi,
49 | },
50 | norm_decay_mult=0.0
51 | ),
52 | clip_grad=dict(max_norm=0.01, norm_type=2)
53 | )
54 |
55 | # Default setting for scaling LR automatically
56 | # - `enable`: whether to scale the LR automatically
57 | #   with the actual batch size.
58 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
59 | auto_scale_lr = dict(enable=True, base_batch_size=16)
60 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/datasets/ade.py:
--------------------------------------------------------------------------------
1 | from mmdet.models import BatchFixedSizePad
2 | from mmengine import read_base
3 |
4 | from seg.models.data_preprocessor import VideoSegDataPreprocessor
5 |
6 | with read_base():
7 | from ..._base_.default_runtime import *
8 | from ..._base_.schedules.schedule_12e import *
9 | from ..._base_.datasets.ade_panoptic_ov import train_dataloader, image_size
10 | from ..._base_.datasets.ade_panoptic import val_dataloader, val_evaluator, test_dataloader, test_evaluator
11 | from ..._base_.datasets.joint_dataset import train_dataloader as training_loader
12 |
13 | batch_augments = [
14 | dict(
15 | type=BatchFixedSizePad,
16 | size=(image_size[1], image_size[0]),
17 | img_pad_value=0,
18 | pad_mask=True,
19 | mask_pad_value=0,
20 | pad_seg=True,
21 | seg_pad_value=255
22 | )
23 | ]
24 | data_preprocessor = dict(
25 | type=VideoSegDataPreprocessor,
26 | mean=[123.675, 116.28, 103.53],
27 | std=[58.395, 57.12, 57.375],
28 | bgr_to_rgb=True,
29 | pad_size_divisor=32,
30 | pad_mask=True,
31 | mask_pad_value=0,
32 | pad_seg=True,
33 | seg_pad_value=255,
34 | batch_augments=batch_augments
35 | )
36 |
37 | num_things_classes = 100
38 | num_stuff_classes = 50
39 | num_classes = num_things_classes + num_stuff_classes
40 |
41 | ov_datasets_name = 'ADEPanopticOVDataset'
42 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/datasets/cityscapes.py:
--------------------------------------------------------------------------------
1 | from mmengine.config import read_base
2 |
3 | from mmdet.models import BatchFixedSizePad
4 |
5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor
6 | from seg.models.utils import NO_OBJ
7 |
8 | with read_base():
9 | from ..._base_.default_runtime import *
10 | from ..._base_.datasets.cityscapes_panoptic import *
11 | from ..._base_.schedules.schedule_12e import *
12 |
13 | batch_augments = [
14 | dict(
15 | type=BatchFixedSizePad,
16 | size=(image_size[1], image_size[0]),
17 | img_pad_value=0,
18 | pad_mask=True,
19 | mask_pad_value=0,
20 | pad_seg=True,
21 | seg_pad_value=255
22 | )
23 | ]
24 | data_preprocessor = dict(
25 | type=VideoSegDataPreprocessor,
26 | mean=[123.675, 116.28, 103.53],
27 | std=[58.395, 57.12, 57.375],
28 | bgr_to_rgb=True,
29 | pad_size_divisor=32,
30 | pad_mask=True,
31 | mask_pad_value=0,
32 | pad_seg=True,
33 | seg_pad_value=NO_OBJ,
34 | batch_augments=batch_augments
35 | )
36 |
37 | num_things_classes = 11
38 | num_stuff_classes = 8
39 | num_classes = num_things_classes + num_stuff_classes
40 |
41 | ov_datasets_name = 'CityscapesPanopticDataset'
42 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/datasets/coco.py:
--------------------------------------------------------------------------------
1 | from mmengine.config import read_base
2 |
3 | from mmdet.models import BatchFixedSizePad
4 |
5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor
6 |
7 | with read_base():
8 | from ..._base_.default_runtime import *
9 | from ..._base_.datasets.coco_panoptic_lsj import *
10 | from ..._base_.schedules.schedule_12e import *
11 |
12 | batch_augments = [
13 | dict(
14 | type=BatchFixedSizePad,
15 | size=(image_size[1], image_size[0]),
16 | img_pad_value=0,
17 | pad_mask=True,
18 | mask_pad_value=0,
19 | pad_seg=True,
20 | seg_pad_value=255
21 | )
22 | ]
23 | data_preprocessor = dict(
24 | type=VideoSegDataPreprocessor,
25 | mean=[123.675, 116.28, 103.53],
26 | std=[58.395, 57.12, 57.375],
27 | bgr_to_rgb=True,
28 | pad_size_divisor=32,
29 | pad_mask=True,
30 | mask_pad_value=0,
31 | pad_seg=True,
32 | seg_pad_value=255,
33 | batch_augments=batch_augments
34 | )
35 |
36 | num_things_classes = 80
37 | num_stuff_classes = 53
38 | num_classes = num_things_classes + num_stuff_classes
39 |
40 | ov_datasets_name = 'CocoPanopticOVDataset'
41 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/datasets/coco_pan_point.py:
--------------------------------------------------------------------------------
1 | from mmengine.config import read_base
2 |
3 | from seg.evaluation.metrics.ins_cls_iou_metric import InsClsIoUMetric
4 | from seg.models.data_preprocessor import OVSAMDataPreprocessor
5 |
6 | with read_base():
7 | from ..._base_.default_runtime import *
8 | from ..._base_.datasets.coco_panoptic_lsj_sam import *
9 | from ..._base_.schedules.schedule_12e import *
10 |
11 | data_preprocessor = dict(
12 | type=OVSAMDataPreprocessor,
13 | mean=[123.675, 116.28, 103.53],
14 | std=[58.395, 57.12, 57.375],
15 | bgr_to_rgb=True,
16 | pad_size_divisor=32,
17 | pad_mask=True,
18 | mask_pad_value=0,
19 | pad_seg=True,
20 | seg_pad_value=255,
21 | batch_augments=None,
22 | use_point_pseudo_box=True
23 | )
24 |
25 | num_things_classes = 80
26 | num_stuff_classes = 0
27 | num_classes = num_things_classes + num_stuff_classes
28 |
29 | ov_datasets_name = 'CocoPanopticOVDataset'
30 |
31 | val_evaluator = dict(
32 | type=InsClsIoUMetric,
33 | with_score=False,
34 | )
35 | test_evaluator = val_evaluator
36 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/datasets/davis.py:
--------------------------------------------------------------------------------
1 | from mmengine.config import read_base
2 |
3 | from mmdet.models import BatchFixedSizePad
4 |
5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor
6 | from seg.models.utils import NO_OBJ
7 |
8 | with read_base():
9 | from ..._base_.default_runtime import *
10 | from ..._base_.datasets.davis import *
11 | from ..._base_.schedules.schedule_12e import *
12 |
13 | batch_augments = [
14 | dict(
15 | type=BatchFixedSizePad,
16 | size=(image_size[1], image_size[0]),
17 | img_pad_value=0,
18 | pad_mask=True,
19 | mask_pad_value=0,
20 | pad_seg=True,
21 | seg_pad_value=NO_OBJ
22 | )
23 | ]
24 | data_preprocessor = dict(
25 | type=VideoSegDataPreprocessor,
26 | mean=[123.675, 116.28, 103.53],
27 | std=[58.395, 57.12, 57.375],
28 | bgr_to_rgb=True,
29 | pad_size_divisor=32,
30 | pad_mask=True,
31 | mask_pad_value=0,
32 | pad_seg=True,
33 | seg_pad_value=NO_OBJ,
34 | batch_augments=batch_augments
35 | )
36 |
37 | num_things_classes = 80
38 | num_stuff_classes = 0
39 | num_classes = num_things_classes + num_stuff_classes
40 |
41 | ov_datasets_name = 'CocoOVDataset'
42 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/datasets/vipseg.py:
--------------------------------------------------------------------------------
1 | from mmengine.config import read_base
2 |
3 | from mmdet.models import BatchFixedSizePad
4 |
5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor
6 |
7 | with read_base():
8 | from ..._base_.default_runtime import *
9 | from ..._base_.datasets.vipseg import *
10 | from ..._base_.schedules.schedule_12e import *
11 |
12 | batch_augments = [
13 | dict(
14 | type=BatchFixedSizePad,
15 | size=(image_size[1], image_size[0]),
16 | img_pad_value=0,
17 | pad_mask=True,
18 | mask_pad_value=0,
19 | pad_seg=True,
20 | seg_pad_value=255
21 | )
22 | ]
23 | data_preprocessor = dict(
24 | type=VideoSegDataPreprocessor,
25 | mean=[123.675, 116.28, 103.53],
26 | std=[58.395, 57.12, 57.375],
27 | bgr_to_rgb=True,
28 | pad_size_divisor=32,
29 | pad_mask=True,
30 | mask_pad_value=0,
31 | pad_seg=True,
32 | seg_pad_value=255,
33 | batch_augments=batch_augments
34 | )
35 |
36 | num_things_classes = 58
37 | num_stuff_classes = 66
38 | num_classes = num_things_classes + num_stuff_classes
39 |
40 | ov_datasets_name = 'VIPSegDataset'
41 | default_hooks.update(
42 | logger=dict(type=LoggerHook, interval=1),
43 | )
44 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/datasets/y19.py:
--------------------------------------------------------------------------------
1 | from mmengine.config import read_base
2 |
3 | from mmdet.models import BatchFixedSizePad
4 |
5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor
6 |
7 | with read_base():
8 | from ..._base_.default_runtime import *
9 | from ..._base_.datasets.youtube_vis_2019 import *
10 | from ..._base_.schedules.schedule_12e import *
11 |
12 | batch_augments = [
13 | dict(
14 | type=BatchFixedSizePad,
15 | size=(image_size[1], image_size[0]),
16 | img_pad_value=0,
17 | pad_mask=True,
18 | mask_pad_value=0,
19 | pad_seg=True,
20 | seg_pad_value=255
21 | )
22 | ]
23 | data_preprocessor = dict(
24 | type=VideoSegDataPreprocessor,
25 | mean=[123.675, 116.28, 103.53],
26 | std=[58.395, 57.12, 57.375],
27 | bgr_to_rgb=True,
28 | pad_size_divisor=32,
29 | pad_mask=True,
30 | mask_pad_value=0,
31 | pad_seg=True,
32 | seg_pad_value=255,
33 | batch_augments=batch_augments
34 | )
35 |
36 | num_things_classes = 40
37 | num_stuff_classes = 0
38 | num_classes = num_things_classes + num_stuff_classes
39 |
40 | ov_datasets_name = 'YouTubeVISDataset_2019'
41 | default_hooks.update(
42 | logger=dict(type=LoggerHook, interval=1),
43 | )
44 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/datasets/y21.py:
--------------------------------------------------------------------------------
1 | from mmengine.config import read_base
2 |
3 | from mmdet.models import BatchFixedSizePad
4 |
5 | from seg.models.data_preprocessor import VideoSegDataPreprocessor
6 |
7 | with read_base():
8 | from ..._base_.default_runtime import *
9 | from ..._base_.datasets.youtube_vis_2021 import *
10 | from ..._base_.schedules.schedule_12e import *
11 | from ..._base_.datasets.joint_dataset import train_dataloader as training_loader
12 |
13 |
14 | batch_augments = [
15 | dict(
16 | type=BatchFixedSizePad,
17 | size=(image_size[1], image_size[0]),
18 | img_pad_value=0,
19 | pad_mask=True,
20 | mask_pad_value=0,
21 | pad_seg=True,
22 | seg_pad_value=255
23 | )
24 | ]
25 | data_preprocessor = dict(
26 | type=VideoSegDataPreprocessor,
27 | mean=[123.675, 116.28, 103.53],
28 | std=[58.395, 57.12, 57.375],
29 | bgr_to_rgb=True,
30 | pad_size_divisor=32,
31 | pad_mask=True,
32 | mask_pad_value=0,
33 | pad_seg=True,
34 | seg_pad_value=255,
35 | batch_augments=batch_augments
36 | )
37 |
38 | num_things_classes = 40
39 | num_stuff_classes = 0
40 | num_classes = num_things_classes + num_stuff_classes
41 |
42 | ov_datasets_name = 'YouTubeVISDataset_2021'
43 | default_hooks.update(
44 | logger=dict(type=LoggerHook, interval=1),
45 | )
46 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/eval_m2_convl_300q_ov_ade.py:
--------------------------------------------------------------------------------
1 | from mmengine import read_base
2 |
3 | with read_base():
4 | from .datasets.ade import *
5 | from .models.m2_convl_300q import *
6 |
7 | model.update(
8 | data_preprocessor=data_preprocessor,
9 | panoptic_head=dict(
10 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
11 | num_things_classes=num_things_classes,
12 | num_stuff_classes=num_stuff_classes,
13 | ),
14 | panoptic_fusion_head=dict(
15 | num_things_classes=num_things_classes,
16 | num_stuff_classes=num_stuff_classes,
17 | ),
18 | test_cfg=dict(
19 | panoptic_on=True,
20 | semantic_on=False,
21 | instance_on=False,
22 | ),
23 | )
24 | overlapping = dict(
25 | train=training_loader.dataset,
26 | test=test_dataloader.dataset
27 | )
28 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/eval_m2_convl_300q_ov_cityscapes.py:
--------------------------------------------------------------------------------
1 | from mmengine import read_base
2 |
3 | with read_base():
4 | from .datasets.cityscapes import *
5 | from .models.m2_convl_300q import *
6 |
7 | model.update(
8 | data_preprocessor=data_preprocessor,
9 | panoptic_head=dict(
10 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
11 | num_things_classes=num_things_classes,
12 | num_stuff_classes=num_stuff_classes,
13 | ),
14 | panoptic_fusion_head=dict(
15 | num_things_classes=num_things_classes,
16 | num_stuff_classes=num_stuff_classes,
17 | ),
18 | test_cfg=dict(
19 | panoptic_on=True,
20 | semantic_on=False,
21 | instance_on=False,
22 | ),
23 | )
24 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/eval_m2_convl_300q_ov_coco.py:
--------------------------------------------------------------------------------
1 | from mmengine import read_base
2 |
3 | with read_base():
4 | from .datasets.coco import *
5 | from .models.m2_convl_300q import *
6 |
7 | model.update(
8 | data_preprocessor=data_preprocessor,
9 | panoptic_head=dict(
10 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
11 | num_things_classes=num_things_classes,
12 | num_stuff_classes=num_stuff_classes,
13 | ),
14 | panoptic_fusion_head=dict(
15 | num_things_classes=num_things_classes,
16 | num_stuff_classes=num_stuff_classes,
17 | ),
18 | test_cfg=dict(
19 | panoptic_on=True,
20 | semantic_on=False,
21 | instance_on=True,
22 | ),
23 | )
24 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/eval_m2_convl_300q_ov_davis.py:
--------------------------------------------------------------------------------
1 | from mmengine import read_base
2 |
3 | from seg.models.detectors import Mask2formerVideoMinVIS
4 |
5 | with read_base():
6 | from .datasets.davis import *
7 | from .models.m2_convl_300q import *
8 |
9 | model.update(
10 | data_preprocessor=data_preprocessor,
11 | type=Mask2formerVideoMinVIS,
12 | clip_size=5,
13 | clip_size_small=3,
14 | whole_clip_thr=0,
15 | small_clip_thr=15,
16 | overlap=0,
17 | panoptic_head=dict(
18 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
19 | num_things_classes=num_things_classes,
20 | num_stuff_classes=num_stuff_classes,
21 | ),
22 | panoptic_fusion_head=dict(
23 | num_things_classes=num_things_classes,
24 | num_stuff_classes=num_stuff_classes,
25 | ),
26 | test_cfg=dict(
27 | panoptic_on=False,
28 | semantic_on=False,
29 | instance_on=False,
30 | proposal_on=True,
31 | num_proposals=25,
32 | ),
33 | )
34 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/eval_m2_convl_300q_ov_vipseg.py:
--------------------------------------------------------------------------------
1 | from mmengine import read_base
2 |
3 | from seg.models.detectors import Mask2formerVideoMinVIS
4 |
5 | with read_base():
6 | from .datasets.vipseg import *
7 | from .models.m2_convl_300q import *
8 |
9 | model.update(
10 | data_preprocessor=data_preprocessor,
11 | type=Mask2formerVideoMinVIS,
12 | clip_size=2,
13 | clip_size_small=3,
14 | whole_clip_thr=0,
15 | small_clip_thr=15,
16 | overlap=0,
17 | panoptic_head=dict(
18 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
19 | num_things_classes=num_things_classes,
20 | num_stuff_classes=num_stuff_classes,
21 | ),
22 | panoptic_fusion_head=dict(
23 | num_things_classes=num_things_classes,
24 | num_stuff_classes=num_stuff_classes,
25 | ),
26 | test_cfg=dict(
27 | panoptic_on=True,
28 | semantic_on=False,
29 | instance_on=False,
30 | ),
31 | )
32 |
33 | val_evaluator = dict(
34 | type=VIPSegMetric,
35 | metric=['VPQ@1', 'VPQ@2', 'VPQ@4', 'VPQ@6'],
36 | format_only=True,
37 | )
38 | test_evaluator = val_evaluator
39 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/eval_m2_convl_300q_ov_y19.py:
--------------------------------------------------------------------------------
1 | from mmengine import read_base
2 |
3 | from seg.models.detectors import Mask2formerVideoMinVIS
4 |
5 | with read_base():
6 | from .datasets.y19 import *
7 | from .models.m2_convl_300q import *
8 |
9 | model.update(
10 | data_preprocessor=data_preprocessor,
11 | type=Mask2formerVideoMinVIS,
12 | clip_size=5,
13 | clip_size_small=3,
14 | whole_clip_thr=0,
15 | small_clip_thr=15,
16 | overlap=0,
17 | panoptic_head=dict(
18 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
19 | num_things_classes=num_things_classes,
20 | num_stuff_classes=num_stuff_classes,
21 | ),
22 | panoptic_fusion_head=dict(
23 | num_things_classes=num_things_classes,
24 | num_stuff_classes=num_stuff_classes,
25 | ),
26 | test_cfg=dict(
27 | panoptic_on=False,
28 | semantic_on=False,
29 | instance_on=True,
30 | ),
31 | )
32 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/eval_m2_convl_300q_ov_y21.py:
--------------------------------------------------------------------------------
1 | from mmengine import read_base
2 |
3 | from seg.models.detectors import Mask2formerVideoMinVIS
4 |
5 | with read_base():
6 | from .datasets.y21 import *
7 | from .models.m2_convl_300q import *
8 |
9 | model.update(
10 | data_preprocessor=data_preprocessor,
11 | type=Mask2formerVideoMinVIS,
12 | clip_size=5,
13 | clip_size_small=3,
14 | whole_clip_thr=0,
15 | small_clip_thr=15,
16 | overlap=0,
17 | panoptic_head=dict(
18 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
19 | num_things_classes=num_things_classes,
20 | num_stuff_classes=num_stuff_classes,
21 | ),
22 | panoptic_fusion_head=dict(
23 | num_things_classes=num_things_classes,
24 | num_stuff_classes=num_stuff_classes,
25 | ),
26 | test_cfg=dict(
27 | panoptic_on=False,
28 | semantic_on=False,
29 | instance_on=True,
30 | ),
31 | )
32 |
--------------------------------------------------------------------------------
/seg/configs/m2ov_val/eval_m2_convl_ov_coco_pan_point.py:
--------------------------------------------------------------------------------
1 | from mmengine import read_base
2 |
3 | with read_base():
4 | from .datasets.coco_pan_point import *
5 | from .models.m2_convl_300q import *
6 |
7 | model.update(
8 | data_preprocessor=data_preprocessor,
9 | inference_sam=True,
10 | panoptic_head=dict(
11 | enable_box_query=True,
12 | ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
13 | num_things_classes=num_things_classes,
14 | num_stuff_classes=num_stuff_classes,
15 | ),
16 | panoptic_fusion_head=dict(
17 | num_things_classes=num_things_classes,
18 | num_stuff_classes=num_stuff_classes,
19 | ),
20 | test_cfg=dict(
21 | panoptic_on=False,
22 | semantic_on=False,
23 | instance_on=True,
24 | ),
25 | )
26 |
--------------------------------------------------------------------------------
/seg/datasets/pipelines/frame_sampling.py:
--------------------------------------------------------------------------------
1 | import random
2 | from typing import Dict, List, Optional
3 |
4 | import numpy as np
5 | from mmdet.registry import TRANSFORMS
6 | from mmdet.datasets.transforms import BaseFrameSample
7 |
8 |
9 | @TRANSFORMS.register_module()
10 | class VideoClipSample(BaseFrameSample):
11 | def __init__(self,
12 | num_selected: int = 1,
13 | interval: int = 1,
14 | collect_video_keys: List[str] = ['video_id', 'video_length']):
15 | self.num_selected = num_selected
16 | self.interval = interval
17 | super().__init__(collect_video_keys=collect_video_keys)
18 |
19 | def transform(self, video_infos: dict) -> Optional[Dict[str, List]]:
20 | """Transform the video information.
21 |
22 | Args:
23 | video_infos (dict): The whole video information.
24 |
25 | Returns:
26 | dict: The data information of the sampled frames.
27 | """
28 | len_with_interval = self.num_selected + (self.num_selected - 1) * (self.interval - 1)
29 | len_video = video_infos['video_length']
30 | if len_with_interval > len_video:
31 | return None
32 |
33 | first_frame_id = random.sample(range(len_video - len_with_interval + 1), 1)[0]
34 |
35 | sampled_frames_ids = first_frame_id + np.arange(self.num_selected) * self.interval
36 | results = self.prepare_data(video_infos, sampled_frames_ids)
37 |
38 | return results
39 |
40 | def __repr__(self) -> str:
41 | repr_str = self.__class__.__name__
42 | repr_str += f'(num_selected={self.num_selected}, '
43 | repr_str += f'interval={self.interval}, '
44 | repr_str += f'collect_video_keys={self.collect_video_keys})'
45 | return repr_str
46 |
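
A standalone sketch of the sampling arithmetic above (values are illustrative): with num_selected=3 and interval=2 a clip spans 5 frames, so the first frame id is drawn from [0, video_length - 5] and the sampled ids are evenly spaced.

    import numpy as np

    num_selected, interval, video_length = 3, 2, 10
    len_with_interval = num_selected + (num_selected - 1) * (interval - 1)  # 5
    first_frame_id = 4  # any value in range(video_length - len_with_interval + 1)
    print(first_frame_id + np.arange(num_selected) * interval)  # [4 6 8]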
--------------------------------------------------------------------------------
/seg/models/backbones/__init__.py:
--------------------------------------------------------------------------------
1 | from .openclip_backbone import OpenCLIPBackbone
2 | from .openclip_backbone import OpenCLIPBackboneText
3 |
--------------------------------------------------------------------------------
/seg/models/data_preprocessor/__init__.py:
--------------------------------------------------------------------------------
1 | from .vidseg_data_preprocessor import VideoSegDataPreprocessor
2 | from .ovsam_preprocessor import OVSAMDataPreprocessor, OVSAMVideoSegDataPreprocessor
3 |
--------------------------------------------------------------------------------
/seg/models/detectors/__init__.py:
--------------------------------------------------------------------------------
1 | from .mask2former_vid import Mask2formerVideo
2 | from .mask2former_vid_minvis import Mask2formerVideoMinVIS
3 |
--------------------------------------------------------------------------------
/seg/models/fusion_head/__init__.py:
--------------------------------------------------------------------------------
1 | from .omgseg_fusionhead import OMGFusionHead
2 |
--------------------------------------------------------------------------------
/seg/models/heads/__init__.py:
--------------------------------------------------------------------------------
1 | from .mask2former_vid import Mask2FormerVideoHead
2 |
--------------------------------------------------------------------------------
/seg/models/task_modules/cost.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Union
2 |
3 | import torch
4 | from mmdet.models.task_modules.assigners.match_cost import BaseMatchCost
5 | from mmengine.structures import InstanceData
6 | from torch import Tensor
7 |
8 | from mmdet.registry import TASK_UTILS
9 |
10 |
11 | @TASK_UTILS.register_module()
12 | class FlexibleClassificationCost(BaseMatchCost):
13 | def __init__(self, weight: Union[float, int] = 1) -> None:
14 | super().__init__(weight=weight)
15 |
16 | def __call__(self,
17 | pred_instances: InstanceData,
18 | gt_instances: InstanceData,
19 | img_meta: Optional[dict] = None,
20 | **kwargs) -> Tensor:
21 | """Compute match cost.
22 |
23 | Args:
24 | pred_instances (:obj:`InstanceData`): ``scores`` inside is
25 | predicted classification logits, of shape
26 | (num_queries, num_class).
27 | gt_instances (:obj:`InstanceData`): ``labels`` inside should have
28 | shape (num_gt, ).
29 | img_meta (Optional[dict]): Image meta information. Defaults to None.
30 |
31 | Returns:
32 | Tensor: Match Cost matrix of shape (num_preds, num_gts).
33 | """
34 | _pred_scores = pred_instances.scores
35 | gt_labels = gt_instances.labels
36 |
37 | pred_scores = _pred_scores[..., :-1]
38 | iou_score = _pred_scores[..., -1:]
39 |
40 | pred_scores = pred_scores.softmax(-1)
41 | iou_score = iou_score.sigmoid()
42 | pred_scores = torch.cat([pred_scores, iou_score], dim=-1)
43 | cls_cost = -pred_scores[:, gt_labels]
44 |
45 | return cls_cost * self.weight
46 |
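
A minimal sketch of calling the cost directly (requires mmdet/mmengine; the score layout below, class logits plus a no-object logit plus a trailing IoU logit, is an assumption about what the head provides):

    import torch
    from mmengine.structures import InstanceData

    from seg.models.task_modules.cost import FlexibleClassificationCost

    num_queries, num_classes = 4, 3
    pred = InstanceData(scores=torch.randn(num_queries, num_classes + 2))
    gt = InstanceData(labels=torch.tensor([0, 2]))

    cost = FlexibleClassificationCost(weight=2.0)(pred, gt)
    print(cost.shape)  # torch.Size([4, 2]), i.e. (num_preds, num_gts)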
--------------------------------------------------------------------------------
/seg/models/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .video_gt_preprocess import preprocess_video_panoptic_gt
2 | from .mask_pool import mask_pool
3 | from .pan_seg_transform import INSTANCE_OFFSET_HB, mmpan2hbpan, mmgt2hbpan
4 | from .class_overlapping import calculate_class_overlapping
5 | from .online_pq_utils import cal_pq, IoUObj, NO_OBJ_ID
6 | from .no_obj import NO_OBJ
7 | from .offline_video_metrics import vpq_eval, stq
8 |
--------------------------------------------------------------------------------
/seg/models/utils/class_overlapping.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 |
4 | def calculate_class_overlapping(classes1: List[str], classes2: List[str]) -> List[bool]:
5 | words1 = [word for item in classes1 for word in item.split(',')]
6 | results = []
7 | for item in classes2:
8 | flag: bool = False
9 | for word in item.split(','):
10 | if word in words1:
11 | flag = True
12 | break
13 | results.append(flag)
14 | return results
15 |
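
Quick illustration (assuming the helper is importable as exported from seg/models/utils/__init__.py): class names are comma-separated synonym lists, so two vocabularies overlap whenever any synonym matches.

    from seg.models.utils import calculate_class_overlapping

    train_classes = ['person', 'tv,television']
    test_classes = ['television', 'dog']
    print(calculate_class_overlapping(train_classes, test_classes))  # [True, False]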
--------------------------------------------------------------------------------
/seg/models/utils/load_checkpoint.py:
--------------------------------------------------------------------------------
1 | from mmengine.runner.checkpoint import CheckpointLoader
2 |
3 |
4 | def load_checkpoint_with_prefix(filename, prefix=None, map_location='cpu', logger='current'):
5 | """Load partial pretrained model with specific prefix.
6 |
7 | Args:
8 | prefix (str): The prefix of sub-module.
9 | filename (str): Accept local filepath, URL, ``torchvision://xxx``,
10 | ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
11 | details.
12 | map_location (str | None): Same as :func:`torch.load`.
13 | Defaults to 'cpu'.
14 | logger: The logger for printing related messages. Defaults to 'current'.
15 |
16 | Returns:
17 | dict or OrderedDict: The loaded checkpoint.
18 | """
19 |
20 | checkpoint = CheckpointLoader.load_checkpoint(filename, map_location=map_location, logger=logger)
21 |
22 | if 'state_dict' in checkpoint:
23 | state_dict = checkpoint['state_dict']
24 | else:
25 | state_dict = checkpoint
26 | if not prefix:
27 | return state_dict
28 | if not prefix.endswith('.'):
29 | prefix += '.'
30 | prefix_len = len(prefix)
31 |
32 | state_dict = {
33 | k[prefix_len:]: v
34 | for k, v in state_dict.items() if k.startswith(prefix)
35 | }
36 |
37 | assert state_dict, f'{prefix} is not in the pretrained model'
38 | return state_dict
39 |
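
Hypothetical usage (the checkpoint path is a placeholder): load only the weights stored under the `backbone.` prefix, with the prefix stripped so they can be fed straight into the sub-module.

    from seg.models.utils.load_checkpoint import load_checkpoint_with_prefix

    state_dict = load_checkpoint_with_prefix('work_dirs/omg_seg_convl.pth', prefix='backbone')
    # model.backbone.load_state_dict(state_dict)  # `model` is assumed to exist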
--------------------------------------------------------------------------------
/seg/models/utils/mask_pool.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 |
5 | # https://github.com/NVlabs/ODISE/blob/e97b06c424c575fec9fc5368dd4b3e050d91abc4/odise/modeling/meta_arch/odise.py#L923
6 |
7 | def mask_pool(x, mask):
8 | """
9 | Args:
10 | x: [B, C, H, W]
11 | mask: [B, Q, H, W]
12 | """
13 | if x.shape[-2:] != mask.shape[-2:]:
14 | # reshape mask to x
15 | mask = F.interpolate(mask, size=x.shape[-2:], mode='bilinear', align_corners=False)
16 | with torch.no_grad():
17 | mask = mask.detach()
18 | mask = (mask > 0).to(mask.dtype)
19 | denorm = mask.sum(dim=(-1, -2), keepdim=True) + 1e-8
20 |
21 | mask_pooled_x = torch.einsum(
22 | "bchw,bqhw->bqc",
23 | x,
24 | mask / denorm,
25 | )
26 | return mask_pooled_x
27 |
28 |
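
A quick shape check (standalone sketch): pooling per-query mask logits over a feature map yields one C-dimensional vector per query; masks are resized to the feature resolution when the two differ.

    import torch

    from seg.models.utils import mask_pool

    feats = torch.randn(2, 256, 32, 32)    # [B, C, H, W]
    masks = torch.randn(2, 100, 128, 128)  # [B, Q, H, W] logits at a finer scale
    print(mask_pool(feats, masks).shape)   # torch.Size([2, 100, 256])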
--------------------------------------------------------------------------------
/seg/models/utils/no_obj.py:
--------------------------------------------------------------------------------
1 | NO_OBJ = 65535
2 |
--------------------------------------------------------------------------------
/seg/models/utils/pan_seg_transform.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | import torch
4 | import numpy as np
5 | from mmdet.evaluation import INSTANCE_OFFSET
6 |
7 | INSTANCE_OFFSET_HB = 10000
8 |
9 |
10 | def mmpan2hbpan(pred_pan_map, num_classes):
11 | pan_seg_map = - np.ones_like(pred_pan_map)
12 | for itm in np.unique(pred_pan_map):
13 | if itm >= INSTANCE_OFFSET:
14 | # cls labels (from segmentation maps)
15 | cls = itm % INSTANCE_OFFSET
16 | # id labels (from tracking maps)
17 | ins = itm // INSTANCE_OFFSET
18 | pan_seg_map[pred_pan_map == itm] = cls * INSTANCE_OFFSET_HB + ins
19 | elif itm == num_classes:
20 | pan_seg_map[pred_pan_map == itm] = num_classes * INSTANCE_OFFSET_HB
21 | else:
22 | pan_seg_map[pred_pan_map == itm] = itm * INSTANCE_OFFSET_HB
23 | assert -1 not in pan_seg_map
24 | return pan_seg_map
25 |
26 |
27 | def mmgt2hbpan(data_samples):
28 | pan_map = copy.deepcopy(data_samples.gt_sem_seg.sem_seg[0])
29 | pan_map = pan_map * INSTANCE_OFFSET_HB
30 | gt_instances = data_samples.gt_instances
31 | for idx in range(len(gt_instances)):
32 | mask = torch.tensor(gt_instances.masks.masks[idx], dtype=torch.bool)
33 | instance_id = gt_instances.instances_ids[idx].item()
34 | pan_map[mask] = instance_id
35 |
36 | return pan_map
37 |
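
A worked example of the re-encoding in mmpan2hbpan (assuming mmdet's INSTANCE_OFFSET of 1000): an mmdet panoptic id of cls + ins * 1000 becomes cls * 10000 + ins, while pure semantic ids are multiplied by 10000.

    import numpy as np

    from seg.models.utils import mmpan2hbpan

    # instance 3 of class 17, plus stuff class 5 (num_classes=19 would mark 'no object')
    pred = np.array([[17 + 3 * 1000, 5]])
    print(mmpan2hbpan(pred, num_classes=19))  # [[170003  50000]]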
--------------------------------------------------------------------------------
/tools/dist.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | FILE=$1
4 | CONFIG=$2
5 | GPUS=$3
6 | NNODES=${NNODES:-1}
7 | NODE_RANK=${NODE_RANK:-0}
8 | PORT=${PORT:-$((28500 + $RANDOM % 2000))}
9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
10 |
11 |
12 | if command -v torchrun &> /dev/null
13 | then
14 | echo "Using torchrun mode."
15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \
16 | torchrun \
17 | --nnodes=${NNODES} \
18 | --node_rank=${NODE_RANK} \
19 | --master_addr=${MASTER_ADDR} \
20 | --master_port=${PORT} \
21 | --nproc_per_node=${GPUS} \
22 | $(dirname "$0")/${FILE}.py ${CONFIG} --launcher pytorch ${@:4}
23 | else
24 | echo "Using launch mode."
25 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \
26 | python -m torch.distributed.launch \
27 | --nnodes=${NNODES} \
28 | --node_rank=${NODE_RANK} \
29 | --master_addr=${MASTER_ADDR} \
30 | --master_port=${PORT} \
31 | --nproc_per_node=${GPUS} \
32 | $(dirname "$0")/${FILE}.py ${CONFIG} --launcher pytorch ${@:4}
33 | fi
34 |
--------------------------------------------------------------------------------
/tools/slurm.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -x
4 |
5 | FILE=$1
6 | CONFIG=$2
7 | GPUS=${GPUS:-8}
8 | GPUS_PER_NODE=${GPUS_PER_NODE:-8}
9 | CPUS_PER_TASK=${CPUS_PER_TASK:-5}
10 | MASTER_PORT=${MASTER_PORT:-$((28500 + $RANDOM % 2000))}
11 | PARTITION=${PARTITION:-DUMMY}
12 | JOB_NAME=${JOB_NAME:-DUMMY}
13 | QUOTATYPE=${QUOTATYPE:-auto}
14 | SRUN_ARGS=${SRUN_ARGS:-""}
15 | PY_ARGS=${@:3}
16 |
17 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \
18 | CUDA_HOME=$(dirname $(dirname $(which nvcc))) \
19 | MASTER_PORT=$MASTER_PORT \
20 | srun -p ${PARTITION} \
21 | --job-name=${JOB_NAME} \
22 | --gres=gpu:${GPUS_PER_NODE} \
23 | --ntasks=${GPUS} \
24 | --ntasks-per-node=${GPUS_PER_NODE} \
25 | --cpus-per-task=${CPUS_PER_TASK} \
26 | --kill-on-bad-exit=1 \
27 | --quotatype=${QUOTATYPE} \
28 | ${SRUN_ARGS} \
29 | python -u tools/${FILE}.py ${CONFIG} --launcher="slurm" ${PY_ARGS}
30 |
--------------------------------------------------------------------------------