├── .gitignore ├── Data.md ├── Enviroment.md ├── README.md ├── lmmrotate ├── __init__.py ├── dataset.py ├── deepspeed_config │ ├── internvl │ │ ├── zero_stage1_config.json │ │ ├── zero_stage2_config.json │ │ └── zero_stage3_config.json │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json ├── eval.py ├── models │ ├── __init__.py │ ├── api.py │ ├── florence2 │ │ ├── __init__.py │ │ ├── configuration_florence2.py │ │ ├── modeling_florence2.py │ │ └── processing_florence2.py │ ├── internvl2 │ │ ├── __init__.py │ │ ├── allseeing_format.py │ │ ├── configuration_intern_vit.py │ │ ├── configuration_internvl_chat.py │ │ ├── conversation.py │ │ ├── modeling_intern_vit.py │ │ ├── modeling_internvl_chat.py │ │ └── processing_internvl2.py │ ├── llava │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── constants.py │ │ ├── conversation.py │ │ ├── language_model │ │ │ ├── __init__.py │ │ │ ├── llava_gemma.py │ │ │ ├── llava_llama.py │ │ │ ├── llava_mistral.py │ │ │ ├── llava_mixtral.py │ │ │ ├── llava_mpt.py │ │ │ ├── llava_qwen.py │ │ │ ├── llava_qwen_moe.py │ │ │ └── modeling_llama.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── mm_utils.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ ├── dev_eva_clip │ │ │ │ ├── eva_clip │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ │ ├── constants.py │ │ │ │ │ ├── eva_vit_model.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── hf_configs.py │ │ │ │ │ ├── hf_model.py │ │ │ │ │ ├── loss.py │ │ │ │ │ ├── model.py │ │ │ │ │ ├── model_configs │ │ │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ │ │ │ ├── modified_resnet.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── pretrained.py │ │ │ │ │ ├── rope.py │ │ │ │ │ ├── timm_model.py │ │ │ │ │ ├── tokenizer.py │ │ │ │ │ ├── transform.py │ │ │ │ │ ├── transformer.py │ │ │ │ │ └── utils.py │ │ │ │ └── eva_vit.py │ │ │ ├── eva_clip │ │ │ │ ├── eva_clip_encoder.py │ │ │ │ ├── eva_clip_processors.py │ │ │ │ ├── eva_vit.py │ │ │ │ ├── factory.py │ │ │ │ └── model_configs │ │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ │ ├── hf_vision.py │ │ │ ├── imagebind.py │ │ │ ├── open_clip_encoder.py │ │ │ └── siglip_encoder.py │ │ ├── multimodal_projector │ │ │ ├── builder.py │ │ │ └── pooler_projector.py │ │ ├── multimodal_resampler │ │ │ ├── builder.py │ │ │ ├── masked_drop.py │ │ │ ├── perceiver.py │ │ │ ├── qformer.py │ │ │ └── spatial_pool.py │ │ ├── processing_llava.py │ │ └── utils.py │ └── version.py ├── modules │ ├── f1_metric.py │ ├── fair_dataset.py │ ├── fair_metric.py │ └── rsar_dataset.py ├── train.py ├── trainer.py └── 
utils.py ├── playground ├── P0368__1024__0___0.png ├── convert_dota_for_sft.py ├── demo.ipynb ├── eval_mmrotate_detector_mapnc.py ├── evaluate_without_scores.ipynb ├── get_num_params.ipynb ├── gradio_app_poly.py ├── map_nc_robustness_cal.py ├── map_nc_robustness_draw.ipynb ├── mmrotate_configs │ ├── dior-1024.py │ ├── dior.py │ ├── fair1m.py │ ├── rotated-fcos-le90_r50_fpn_1x_dior-1024.py │ ├── rotated-fcos-le90_r50_fpn_1x_dior.py │ ├── rotated-fcos-le90_r50_fpn_1x_dota-train.py │ ├── rotated-fcos-le90_r50_fpn_1x_fair.py │ ├── rotated-fcos-le90_r50_fpn_1x_rsar-1024.py │ ├── rotated-fcos-le90_r50_fpn_1x_rsar.py │ ├── rotated-fcos-le90_r50_fpn_6x_srsdd.py │ ├── rotated-retinanet-rbox-le90_r50_fpn_1x_dior-1024.py │ ├── rotated-retinanet-rbox-le90_r50_fpn_1x_dota-train.py │ ├── rotated-retinanet-rbox-le90_r50_fpn_1x_dota.py │ ├── rotated-retinanet-rbox-le90_r50_fpn_1x_fair.py │ ├── rotated-retinanet-rbox-le90_r50_fpn_1x_rsar-1024.py │ ├── rotated-retinanet-rbox-le90_r50_fpn_1x_rsar.py │ ├── rotated-retinanet-rbox-le90_r50_fpn_6x_srsdd.py │ ├── rsar-1024.py │ ├── rsar.py │ └── srsdd.py ├── mmrotate_img_split.py ├── mmrotate_test.py ├── mmrotate_train.py └── times.ttf ├── pyproject.toml └── scripts ├── eval_standalone.sh ├── florence-2-l_vis1024-lang2048_dota1-v2_b1x1xga32-50e.sh ├── florence-2-l_vis1024-lang2048_dota1-v2_b2x2xga8-50e.sh ├── florence-2-l_vis1024-lang2048_dota1-v2_b2x4xga4-50e.sh └── slurm ├── eval_slurm.sh └── florence-2-l_vis1024-lang2048_dota1-v2_b2x16-100e.sh /.gitignore: -------------------------------------------------------------------------------- 1 | checkpoints* 2 | playground/data* 3 | *mmrotate_workdir* 4 | .vscode/launch.json 5 | tmp* 6 | .trash/** 7 | scripts_py/eval_rfcos/* 8 | scripts_py/eval_mmrotate/* 9 | scripts_py/mask_rcnn_demo/* 10 | **.pth 11 | **.pyc 12 | lmmrotate.egg-info/** 13 | data 14 | work_dirs/** 15 | **.svg 16 | -------------------------------------------------------------------------------- /Enviroment.md: -------------------------------------------------------------------------------- 1 | ## Environment 2 | **NOTE: a mismatched environment between inference and training may lead to degraded results.** 3 | 4 | - create the conda env 5 | ```shell 6 | conda create -n lmmrotate python=3.10.12 7 | conda activate lmmrotate 8 | ``` 9 | 10 | - set cuda&gcc (recommended for the current env; you can also set them in ~/.bashrc) 11 | ```shell 12 | mkdir -p $CONDA_PREFIX/etc/conda/activate.d 13 | touch $CONDA_PREFIX/etc/conda/activate.d/cuda_env.sh 14 | vim $CONDA_PREFIX/etc/conda/activate.d/cuda_env.sh 15 | ``` 16 | write the following lines: 17 | ```shell 18 | # set cuda&gcc home 19 | export CUDA_HOME=todo # change this to your cuda home 20 | export GCC_HOME=todo # change this to your gcc home 21 | # remove redundant cuda&gcc path 22 | export PATH=$(echo "$PATH" | sed -e 's#[^:]*cuda[^:]*:##g' -e 's#:[^:]*cuda[^:]*##g' -e 's#[^:]*gcc[^:]*:##g' -e 's#:[^:]*gcc[^:]*##g') 23 | export LD_LIBRARY_PATH=$(echo "$LD_LIBRARY_PATH" | sed -e 's#[^:]*cuda[^:]*:##g' -e 's#:[^:]*cuda[^:]*##g' -e 's#[^:]*gcc[^:]*:##g' -e 's#:[^:]*gcc[^:]*##g') 24 | # set cuda&gcc path 25 | export PATH=$CUDA_HOME/bin:$GCC_HOME/bin:$PATH 26 | export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$GCC_HOME/lib64:$LD_LIBRARY_PATH 27 | # set site-packages path 28 | export SITE_PACKAGES_PATH=$(python -c "import site; print(site.getsitepackages()[0])") 29 | ``` 30 | then re-run `conda activate lmmrotate` to enable these env vars 31 | 32 | - install torch 33 | ```shell 34 | pip install torch==2.3.0 torchvision==0.18.0 --index-url
https://download.pytorch.org/whl/cu121 35 | ``` 36 | 37 | - build and install [mmcv](https://mmcv.readthedocs.io/en/latest/) 38 | ```shell 39 | # install mim 40 | pip install openmim 41 | 42 | # install mmcv (choose one option below) 43 | # option 1: install with openmim 44 | mim install "mmcv==2.0.1" 45 | # option 2: install from source (recommended) 46 | git clone https://github.com/open-mmlab/mmcv.git $SITE_PACKAGES_PATH/mmcv 47 | cd $SITE_PACKAGES_PATH/mmcv 48 | git checkout v2.0.1 49 | pip install -r requirements/optional.txt 50 | echo 'set -x;TORCH_CUDA_ARCH_LIST=$(python -c "import torch; print(f'\''{torch.cuda.get_device_capability()[0]}.{torch.cuda.get_device_capability()[1]}'\'')") pip install -e . -v' >> install.sh 51 | bash install.sh 52 | ``` 53 | Compiling mmcv v2.0.1 may raise an error because torch requires a C++17 (or later) compatible compiler. One workaround is described in [this issue](https://github.com/open-mmlab/mmcv/issues/2860). 54 | > Changing `c++14` to `c++17` in [line 204](https://github.com/open-mmlab/mmcv/blob/d28aa8a9cced3158e724585d5e6839947ca5c449/setup.py#L204) and [line 421](https://github.com/open-mmlab/mmcv/blob/d28aa8a9cced3158e724585d5e6839947ca5c449/setup.py#L421) of `setup.py` can temporarily fix this issue. 55 | 56 | - install OpenMMLab mmdet and mmrotate 57 | ```shell 58 | mim install "mmdet==3.0.0" 59 | mim install "mmrotate==1.0.0rc1" 60 | ``` 61 | 62 | - install [flash-attention](https://github.com/Dao-AILab/flash-attention) 63 | ```shell 64 | pip install flash-attn==2.7.0.post2 --no-build-isolation 65 | ``` 66 | 67 | - install lmmrotate 68 | ```shell 69 | pip install -e . 70 | ``` 71 | 72 | - torch 2.3.0 may raise a warning like: 73 | > site-packages/torch/nn/modules/conv.py:456: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.) In pytorch 2.3.0, this unwanted warning is printed even if no exception is thrown; see https://github.com/pytorch/pytorch/pull/125790.
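The cuDNN warning above is harmless but can clutter training logs. As an optional workaround (a minimal sketch, not part of this repo; the message pattern simply matches the start of the warning text quoted above), it can be silenced with Python's standard `warnings` module before the first convolution runs:
```python
import warnings

# Optional: filter the spurious cuDNN warning emitted by torch 2.3.0.
# The "message" argument is a regex matched against the start of the warning text.
warnings.filterwarnings(
    "ignore",
    message="Plan failed with a cudnnException",
    category=UserWarning,
)
```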
74 | -------------------------------------------------------------------------------- /lmmrotate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionXLab/mllm-mmrotate/110f0ffdfd6788c0a73eaa3457cc4dfbdfc28bbb/lmmrotate/__init__.py -------------------------------------------------------------------------------- /lmmrotate/deepspeed_config/internvl/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": true 41 | } -------------------------------------------------------------------------------- /lmmrotate/deepspeed_config/internvl/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e8, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e8, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /lmmrotate/deepspeed_config/internvl/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e7, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 
0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } -------------------------------------------------------------------------------- /lmmrotate/deepspeed_config/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /lmmrotate/deepspeed_config/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /lmmrotate/deepspeed_config/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } 
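The `"auto"` placeholders in the DeepSpeed configs above are filled in by the HuggingFace Trainer integration from the corresponding training arguments at launch time. As a rough illustration only (a minimal sketch assuming the standard `transformers.TrainingArguments` DeepSpeed hook, not an excerpt from this repo's `train.py`; paths and values are hypothetical), one of these files is consumed like this:
```python
from transformers import TrainingArguments

# Hypothetical values -- each argument below fills the matching "auto" field
# (when present in the chosen JSON) once the Trainer initializes DeepSpeed.
training_args = TrainingArguments(
    output_dir="checkpoints/example-run",   # hypothetical output path
    per_device_train_batch_size=1,          # -> train_micro_batch_size_per_gpu
    gradient_accumulation_steps=32,         # -> gradient_accumulation_steps
    learning_rate=1e-5,                     # -> optimizer "lr"
    weight_decay=0.0,                       # -> optimizer "weight_decay"
    bf16=True,                              # -> "bf16": {"enabled": "auto"}
    deepspeed="lmmrotate/deepspeed_config/zero2.json",
)
```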
-------------------------------------------------------------------------------- /lmmrotate/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .florence2 import ( 2 | Florence2ForConditionalGeneration, Florence2Config, Florence2VisionConfig, Florence2LanguageConfig, Florence2Processor, Florence2PostProcesser 3 | ) 4 | from .internvl2 import ( 5 | InternVLChatModel, InternVLChatConfig, SimpleInternVL2Processor 6 | ) 7 | from .llava import ( 8 | LlavaQwenForCausalLM, LlavaQwenConfig, SimpleLlavaQwenProcessor 9 | ) 10 | from .api import get_inferencer 11 | from .version import default_version_commit_id 12 | 13 | __all__ = [ 14 | "Florence2ForConditionalGeneration", 15 | "Florence2Config", 16 | "Florence2VisionConfig", 17 | "Florence2LanguageConfig", 18 | "Florence2Processor", 19 | "Florence2PostProcesser", 20 | "InternVLChatModel", 21 | "InternVLChatConfig", 22 | "SimpleInternVL2Processor", 23 | "LlavaQwenForCausalLM", 24 | "LlavaQwenConfig", 25 | "SimpleLlavaQwenProcessor", 26 | "get_inferencer", 27 | ] -------------------------------------------------------------------------------- /lmmrotate/models/florence2/__init__.py: -------------------------------------------------------------------------------- 1 | # current version was based on https://huggingface.co/microsoft/Florence-2-large/tree/6bf179230dd8855083a51a5e11beb04aec1291fd 2 | from .modeling_florence2 import Florence2ForConditionalGeneration 3 | from .configuration_florence2 import Florence2Config, Florence2VisionConfig, Florence2LanguageConfig 4 | from .processing_florence2 import Florence2Processor, Florence2PostProcesser 5 | 6 | __all__ = [ 7 | "Florence2ForConditionalGeneration", 8 | "Florence2Config", 9 | "Florence2VisionConfig", 10 | "Florence2LanguageConfig", 11 | "Florence2Processor", 12 | "Florence2PostProcesser", 13 | ] 14 | -------------------------------------------------------------------------------- /lmmrotate/models/internvl2/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling_internvl_chat import InternVLChatModel 2 | from .configuration_internvl_chat import InternVLChatConfig 3 | from .processing_internvl2 import SimpleInternVL2Processor 4 | 5 | 6 | __all__ = [ 7 | "InternVLChatModel", 8 | "InternVLChatConfig", 9 | "SimpleInternVL2Processor", 10 | ] -------------------------------------------------------------------------------- /lmmrotate/models/internvl2/configuration_intern_vit.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | from typing import Union 9 | 10 | from transformers.configuration_utils import PretrainedConfig 11 | from transformers.utils import logging 12 | 13 | logger = logging.get_logger(__name__) 14 | 15 | 16 | class InternVisionConfig(PretrainedConfig): 17 | r""" 18 | This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to 19 | instantiate a vision encoder according to the specified arguments, defining the model architecture. 20 | 21 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 22 | documentation from [`PretrainedConfig`] for more information.
23 | 24 | Args: 25 | num_channels (`int`, *optional*, defaults to 3): 26 | Number of color channels in the input images (e.g., 3 for RGB). 27 | patch_size (`int`, *optional*, defaults to 14): 28 | The size (resolution) of each patch. 29 | image_size (`int`, *optional*, defaults to 224): 30 | The size (resolution) of each image. 31 | qkv_bias (`bool`, *optional*, defaults to `False`): 32 | Whether to add a bias to the queries and values in the self-attention layers. 33 | hidden_size (`int`, *optional*, defaults to 3200): 34 | Dimensionality of the encoder layers and the pooler layer. 35 | num_attention_heads (`int`, *optional*, defaults to 25): 36 | Number of attention heads for each attention layer in the Transformer encoder. 37 | intermediate_size (`int`, *optional*, defaults to 12800): 38 | Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 39 | qk_normalization (`bool`, *optional*, defaults to `True`): 40 | Whether to normalize the queries and keys in the self-attention layers. 41 | num_hidden_layers (`int`, *optional*, defaults to 48): 42 | Number of hidden layers in the Transformer encoder. 43 | use_flash_attn (`bool`, *optional*, defaults to `True`): 44 | Whether to use flash attention mechanism. 45 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): 46 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 47 | `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. 48 | layer_norm_eps (`float`, *optional*, defaults to 1e-6): 49 | The epsilon used by the layer normalization layers. 50 | dropout (`float`, *optional*, defaults to 0.0): 51 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 52 | drop_path_rate (`float`, *optional*, defaults to 0.0): 53 | Dropout rate for stochastic depth. 54 | attention_dropout (`float`, *optional*, defaults to 0.0): 55 | The dropout ratio for the attention probabilities. 56 | initializer_range (`float`, *optional*, defaults to 0.02): 57 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 58 | initializer_factor (`float`, *optional*, defaults to 0.1): 59 | A factor for layer scale. 
60 | """ 61 | 62 | model_type = 'intern_vit_6b' 63 | 64 | def __init__( 65 | self, 66 | num_channels=3, 67 | patch_size=14, 68 | image_size=224, 69 | qkv_bias=False, 70 | hidden_size=3200, 71 | num_attention_heads=25, 72 | intermediate_size=12800, 73 | qk_normalization=True, 74 | num_hidden_layers=48, 75 | use_flash_attn=True, 76 | hidden_act='gelu', 77 | norm_type='rms_norm', 78 | layer_norm_eps=1e-6, 79 | dropout=0.0, 80 | drop_path_rate=0.0, 81 | attention_dropout=0.0, 82 | initializer_range=0.02, 83 | initializer_factor=0.1, 84 | **kwargs, 85 | ): 86 | super().__init__(**kwargs) 87 | 88 | self.hidden_size = hidden_size 89 | self.intermediate_size = intermediate_size 90 | self.dropout = dropout 91 | self.drop_path_rate = drop_path_rate 92 | self.num_hidden_layers = num_hidden_layers 93 | self.num_attention_heads = num_attention_heads 94 | self.num_channels = num_channels 95 | self.patch_size = patch_size 96 | self.image_size = image_size 97 | self.initializer_range = initializer_range 98 | self.initializer_factor = initializer_factor 99 | self.attention_dropout = attention_dropout 100 | self.layer_norm_eps = layer_norm_eps 101 | self.hidden_act = hidden_act 102 | self.norm_type = norm_type 103 | self.qkv_bias = qkv_bias 104 | self.qk_normalization = qk_normalization 105 | self.use_flash_attn = use_flash_attn 106 | 107 | @classmethod 108 | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig': 109 | config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) 110 | 111 | if 'vision_config' in config_dict: 112 | config_dict = config_dict['vision_config'] 113 | 114 | if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type: 115 | logger.warning( 116 | f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " 117 | f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' 118 | ) 119 | 120 | return cls.from_dict(config_dict, **kwargs) 121 | -------------------------------------------------------------------------------- /lmmrotate/models/internvl2/configuration_internvl_chat.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import copy 8 | 9 | from transformers import AutoConfig, LlamaConfig, Qwen2Config 10 | from transformers.configuration_utils import PretrainedConfig 11 | from transformers.utils import logging 12 | 13 | from .configuration_intern_vit import InternVisionConfig 14 | 15 | logger = logging.get_logger(__name__) 16 | 17 | 18 | class InternVLChatConfig(PretrainedConfig): 19 | model_type = 'internvl_chat' 20 | is_composition = True 21 | 22 | def __init__( 23 | self, 24 | vision_config=None, 25 | llm_config=None, 26 | use_backbone_lora=0, 27 | use_llm_lora=0, 28 | select_layer=-1, 29 | force_image_size=None, 30 | downsample_ratio=0.5, 31 | template=None, 32 | dynamic_image_size=False, 33 | use_thumbnail=False, 34 | ps_version='v1', 35 | min_dynamic_patch=1, 36 | max_dynamic_patch=6, 37 | **kwargs): 38 | super().__init__(**kwargs) 39 | 40 | if vision_config is None: 41 | vision_config = {'architectures': ['InternVisionModel']} 42 | logger.info('vision_config is None. 
Initializing the InternVisionConfig with default values.') 43 | 44 | if llm_config is None: 45 | llm_config = {'architectures': ['Qwen2ForCausalLM']} 46 | logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).') 47 | 48 | self.vision_config = InternVisionConfig(**vision_config) 49 | if llm_config.get('architectures')[0] == 'LlamaForCausalLM': 50 | self.llm_config = LlamaConfig(**llm_config) 51 | elif llm_config.get('architectures')[0] == 'Qwen2ForCausalLM': 52 | self.llm_config = Qwen2Config(**llm_config) 53 | else: 54 | raise ValueError('Unsupported architecture: {}'.format(llm_config.get('architectures')[0])) 55 | self.use_backbone_lora = use_backbone_lora 56 | self.use_llm_lora = use_llm_lora 57 | self.select_layer = select_layer 58 | self.force_image_size = force_image_size 59 | self.downsample_ratio = downsample_ratio 60 | self.template = template 61 | self.dynamic_image_size = dynamic_image_size 62 | self.use_thumbnail = use_thumbnail 63 | self.ps_version = ps_version # pixel shuffle version 64 | self.min_dynamic_patch = min_dynamic_patch 65 | self.max_dynamic_patch = max_dynamic_patch 66 | 67 | logger.info(f'vision_select_layer: {self.select_layer}') 68 | logger.info(f'ps_version: {self.ps_version}') 69 | logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}') 70 | logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}') 71 | 72 | def to_dict(self): 73 | """ 74 | Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 75 | 76 | Returns: 77 | `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, 78 | """ 79 | output = copy.deepcopy(self.__dict__) 80 | output['vision_config'] = self.vision_config.to_dict() 81 | output['llm_config'] = self.llm_config.to_dict() 82 | output['model_type'] = self.__class__.model_type 83 | output['use_backbone_lora'] = self.use_backbone_lora 84 | output['use_llm_lora'] = self.use_llm_lora 85 | output['select_layer'] = self.select_layer 86 | output['force_image_size'] = self.force_image_size 87 | output['downsample_ratio'] = self.downsample_ratio 88 | output['template'] = self.template 89 | output['dynamic_image_size'] = self.dynamic_image_size 90 | output['use_thumbnail'] = self.use_thumbnail 91 | output['ps_version'] = self.ps_version 92 | output['min_dynamic_patch'] = self.min_dynamic_patch 93 | output['max_dynamic_patch'] = self.max_dynamic_patch 94 | 95 | return output 96 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/__init__.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/LLaVA-VL/LLaVA-NeXT/tree/79ef45a6d8b89b92d7a8525f077c3a3a9894a87d/llava/model # noqa 2 | from .builder import load_pretrained_model 3 | from .language_model import ( 4 | LlavaLlamaForCausalLM, LlavaQwenForCausalLM, LlavaMistralForCausalLM, LlavaMixtralForCausalLM, 5 | LlavaQwenMoeForCausalLM, LlavaGemmaForCausalLM, LlavaMptForCausalLM, 6 | LlavaLlamaConfig, LlavaQwenConfig, LlavaMistralConfig, LlavaMixtralConfig, 7 | LlavaQwenMoeConfig, LlavaGemmaConfig, LlavaMptConfig, 8 | ) 9 | from .processing_llava import SimpleLlavaQwenProcessor 10 | 11 | __all__ = [ 12 | "load_pretrained_model", 13 | "LlavaLlamaForCausalLM", 14 | "LlavaQwenForCausalLM", 15 | "LlavaMistralForCausalLM", 16 | "LlavaMixtralForCausalLM", 17 | "LlavaQwenMoeForCausalLM", 18 | "LlavaGemmaForCausalLM", 19 | "LlavaMptForCausalLM", 20 | 
"LlavaLlamaConfig", 21 | "LlavaQwenConfig", 22 | "LlavaMistralConfig", 23 | "LlavaMixtralConfig", 24 | "LlavaQwenMoeConfig", 25 | "LlavaGemmaConfig", 26 | "LlavaMptConfig", 27 | "SimpleLlavaQwenProcessor", 28 | ] 29 | 30 | AVAILABLE_MODELS = { 31 | # "llava_llama": "LlavaLlamaForCausalLM, LlavaConfig", 32 | "llava_qwen": "LlavaQwenForCausalLM, LlavaQwenConfig", 33 | # "llava_mistral": "LlavaMistralForCausalLM, LlavaMistralConfig", 34 | # "llava_mixtral": "LlavaMixtralForCausalLM, LlavaMixtralConfig", 35 | # "llava_qwen_moe": "LlavaQwenMoeForCausalLM, LlavaQwenMoeConfig", 36 | # Add other models as needed 37 | } 38 | 39 | for model_name, model_classes in AVAILABLE_MODELS.items(): 40 | try: 41 | exec(f"from .language_model.{model_name} import {model_classes}") 42 | except Exception as e: 43 | print(f"Failed to import {model_name} from llava.language_model.{model_name}. Error: {e}") 44 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from tqdm import tqdm 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | from lmmrotate.models.llava import LlavaLlamaForCausalLM 12 | 13 | 14 | def apply_delta(base_model_path, target_model_path, delta_path): 15 | print("Loading base model") 16 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 31 | bparam = base.state_dict()[name] 32 | param.data[: bparam.shape[0], : bparam.shape[1]] += bparam 33 | 34 | print("Saving target model") 35 | delta.save_pretrained(target_model_path) 36 | delta_tokenizer.save_pretrained(target_model_path) 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument("--base-model-path", type=str, required=True) 42 | parser.add_argument("--target-model-path", type=str, required=True) 43 | parser.add_argument("--delta-path", type=str, required=True) 44 | 45 | args = parser.parse_args() 46 | 47 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 48 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from transformers import AutoTokenizer, 
AutoModelForCausalLM 10 | from lmmrotate.models.llava import * 11 | from llava.model.utils import auto_upgrade # Not implemented yet 12 | 13 | 14 | def consolidate_ckpt(src_path, dst_path): 15 | print("Loading model") 16 | auto_upgrade(src_path) 17 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 18 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 19 | src_model.save_pretrained(dst_path) 20 | src_tokenizer.save_pretrained(dst_path) 21 | 22 | 23 | if __name__ == "__main__": 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--src", type=str, required=True) 26 | parser.add_argument("--dst", type=str, required=True) 27 | 28 | args = parser.parse_args() 29 | 30 | consolidate_ckpt(args.src, args.dst) 31 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/constants.py: -------------------------------------------------------------------------------- 1 | # Model Constants 2 | IGNORE_INDEX = -100 3 | IMAGE_TOKEN_INDEX = -200 4 | DEFAULT_IMAGE_TOKEN = "<image>" 5 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 6 | DEFAULT_IM_START_TOKEN = "<im_start>" 7 | DEFAULT_IM_END_TOKEN = "<im_end>" -------------------------------------------------------------------------------- /lmmrotate/models/llava/language_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava_llama import LlavaLlamaForCausalLM, LlavaConfig as LlavaLlamaConfig 2 | from .llava_qwen import LlavaQwenForCausalLM, LlavaQwenConfig 3 | from .llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 4 | from .llava_mixtral import LlavaMixtralForCausalLM, LlavaMixtralConfig 5 | from .llava_qwen_moe import LlavaQwenMoeForCausalLM, LlavaQwenMoeConfig 6 | from .llava_gemma import LlavaGemmaForCausalLM, LlavaGemmaConfig 7 | from .llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 8 | 9 | __all__ = [ 10 | "LlavaLlamaForCausalLM", "LlavaLlamaConfig", 11 | "LlavaQwenForCausalLM", "LlavaQwenConfig", 12 | "LlavaMistralForCausalLM", "LlavaMistralConfig", 13 | "LlavaMixtralForCausalLM", "LlavaMixtralConfig", 14 | "LlavaQwenMoeForCausalLM", "LlavaQwenMoeConfig", 15 | "LlavaGemmaForCausalLM", "LlavaGemmaConfig", 16 | "LlavaMptForCausalLM", "LlavaMptConfig", 17 | ] 18 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/language_model/llava_gemma.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Duc Q. Nguyen, Haotian Liu and Bo Li 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, GemmaConfig, GemmaModel, GemmaForCausalLM 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaGemmaConfig(GemmaConfig): 31 | model_type = "llava_gemma" 32 | 33 | 34 | class LlavaGemmaModel(LlavaMetaModel, GemmaModel): 35 | config_class = LlavaGemmaConfig 36 | 37 | def __init__(self, config: GemmaConfig): 38 | super(LlavaGemmaModel, self).__init__(config) 39 | 40 | 41 | class LlavaGemmaForCausalLM(GemmaForCausalLM, LlavaMetaForCausalLM): 42 | config_class = LlavaGemmaConfig 43 | 44 | def __init__(self, config): 45 | super(GemmaForCausalLM, self).__init__(config) 46 | self.model = LlavaGemmaModel(config) 47 | 48 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.model 55 | 56 | def forward( 57 | self, 58 | input_ids: torch.LongTensor = None, 59 | attention_mask: Optional[torch.Tensor] = None, 60 | position_ids: Optional[torch.LongTensor] = None, 61 | past_key_values: Optional[List[torch.FloatTensor]] = None, 62 | inputs_embeds: Optional[torch.FloatTensor] = None, 63 | labels: Optional[torch.LongTensor] = None, 64 | use_cache: Optional[bool] = None, 65 | output_attentions: Optional[bool] = None, 66 | output_hidden_states: Optional[bool] = None, 67 | images: Optional[torch.FloatTensor] = None, 68 | image_sizes: Optional[List[List[int]]] = None, 69 | return_dict: Optional[bool] = None, 70 | cache_position: Optional[torch.LongTensor] = None, 71 | ) -> Union[Tuple, CausalLMOutputWithPast]: 72 | 73 | if inputs_embeds is None: 74 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes) 75 | 76 | return super().forward( 77 | input_ids=input_ids, 78 | attention_mask=attention_mask, 79 | position_ids=position_ids, 80 | past_key_values=past_key_values, 81 | inputs_embeds=inputs_embeds, 82 | labels=labels, 83 | use_cache=use_cache, 84 | output_attentions=output_attentions, 85 | output_hidden_states=output_hidden_states, 86 | return_dict=return_dict, 87 | cache_position=cache_position, 88 | ) 89 | 90 | @torch.no_grad() 91 | def generate( 92 | self, 93 | inputs: Optional[torch.Tensor] = None, 94 | images: Optional[torch.Tensor] = None, 95 | image_sizes: Optional[torch.Tensor] = None, 96 | **kwargs, 97 | ) -> Union[GenerateOutput, torch.LongTensor]: 98 | position_ids = kwargs.pop("position_ids", None) 99 | attention_mask = kwargs.pop("attention_mask", None) 100 | if "inputs_embeds" in kwargs: 101 | raise NotImplementedError("`inputs_embeds` is not supported") 102 | 103 | if images is not None: 104 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes) 105 | else: 106 | inputs_embeds = self.get_model().embed_tokens(inputs) 107 | 108 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 109 | 110 | def 
prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 111 | images = kwargs.pop("images", None) 112 | image_sizes = kwargs.pop("image_sizes", None) 113 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 114 | if images is not None: 115 | inputs["images"] = images 116 | if image_sizes is not None: 117 | inputs["image_sizes"] = image_sizes 118 | return inputs 119 | 120 | 121 | AutoConfig.register("llava_gemma", LlavaGemmaConfig) 122 | AutoModelForCausalLM.register(LlavaGemmaConfig, LlavaGemmaForCausalLM) 123 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/language_model/llava_mistral.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, MistralConfig, MistralModel, MistralForCausalLM, GenerationConfig 23 | 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | from transformers.generation.utils import GenerateOutput 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaMistralConfig(MistralConfig): 31 | model_type = "llava_mistral" 32 | temperature: float = 0.0 # reset to 0.0, previously 0.9 for Vicuna 33 | max_new_tokens: int = 1024 34 | do_sample: bool = False 35 | top_p: Optional[float] = None 36 | 37 | 38 | class LlavaMistralModel(LlavaMetaModel, MistralModel): 39 | config_class = LlavaMistralConfig 40 | 41 | def __init__(self, config: MistralConfig): 42 | super(LlavaMistralModel, self).__init__(config) 43 | 44 | 45 | class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM): 46 | config_class = LlavaMistralConfig 47 | 48 | def __init__(self, config): 49 | super(MistralForCausalLM, self).__init__(config) 50 | 51 | config.model_type = "llava_mistral" 52 | config.rope_scaling = None 53 | 54 | self.model = LlavaMistralModel(config) 55 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 56 | # Initialize weights and apply final processing 57 | self.post_init() 58 | 59 | def get_model(self): 60 | return self.model 61 | 62 | def forward( 63 | self, 64 | input_ids: torch.LongTensor = None, 65 | attention_mask: Optional[torch.Tensor] = None, 66 | position_ids: Optional[torch.LongTensor] = None, 67 | past_key_values: Optional[List[torch.FloatTensor]] = None, 68 | inputs_embeds: Optional[torch.FloatTensor] = None, 69 | labels: Optional[torch.LongTensor] = None, 70 | use_cache: Optional[bool] = None, 71 | output_attentions: Optional[bool] = None, 72 | output_hidden_states: Optional[bool] = None, 73 | images: Optional[torch.FloatTensor] = None, 74 | 
image_sizes: Optional[List[List[int]]] = None, 75 | return_dict: Optional[bool] = None, 76 | cache_position=None, 77 | ) -> Union[Tuple, CausalLMOutputWithPast]: 78 | 79 | if inputs_embeds is None: 80 | (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes) 81 | 82 | return super().forward( 83 | input_ids=input_ids, 84 | attention_mask=attention_mask, 85 | position_ids=position_ids, 86 | past_key_values=past_key_values, 87 | inputs_embeds=inputs_embeds, 88 | labels=labels, 89 | use_cache=use_cache, 90 | output_attentions=output_attentions, 91 | output_hidden_states=output_hidden_states, 92 | return_dict=return_dict, 93 | ) 94 | 95 | @torch.no_grad() 96 | def generate( 97 | self, 98 | inputs: Optional[torch.Tensor] = None, 99 | images: Optional[torch.Tensor] = None, 100 | image_sizes: Optional[torch.Tensor] = None, 101 | **kwargs, 102 | ) -> Union[GenerateOutput, torch.LongTensor]: 103 | position_ids = kwargs.pop("position_ids", None) 104 | attention_mask = kwargs.pop("attention_mask", None) 105 | if "inputs_embeds" in kwargs: 106 | raise NotImplementedError("`inputs_embeds` is not supported") 107 | 108 | if images is not None: 109 | (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes) 110 | else: 111 | inputs_embeds = self.get_model().embed_tokens(inputs) 112 | 113 | return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) 114 | 115 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 116 | images = kwargs.pop("images", None) 117 | image_sizes = kwargs.pop("image_sizes", None) 118 | inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 119 | if images is not None: 120 | inputs["images"] = images 121 | if image_sizes is not None: 122 | inputs["image_sizes"] = image_sizes 123 | return inputs 124 | 125 | 126 | AutoConfig.register("llava_mistral", LlavaMistralConfig) 127 | AutoModelForCausalLM.register(LlavaMistralConfig, LlavaMistralForCausalLM) 128 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, MptConfig, MptForCausalLM, MptModel, GenerationConfig 21 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 22 | 23 | 24 | class LlavaMptConfig(MptConfig): 25 | model_type = "llava_mpt" 26 | 27 | 28 | class LlavaMptModel(LlavaMetaModel, MptModel): 29 | config_class = LlavaMptConfig 30 | 31 | def __init__(self, config: MptConfig): 32 | config.hidden_size = config.d_model 33 | super(LlavaMptModel, self).__init__(config) 34 | 35 | def embed_tokens(self, x): 36 | return self.wte(x) 37 | 38 | 39 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 40 | config_class = LlavaMptConfig 41 | supports_gradient_checkpointing = True 42 | 43 | def __init__(self, config): 44 | super(MptForCausalLM, self).__init__(config) 45 | 46 | config.model_type = "llava_mpt" 47 | config.rope_scaling = None 48 | self.generation_config = GenerationConfig( 49 | temperature=0.0, 50 | max_new_tokens=1024, 51 | do_sample=False, 52 | top_p=None, 53 | ) 54 | 55 | self.transformer = LlavaMptModel(config) 56 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 57 | 58 | # Initialize weights and apply final processing 59 | self.post_init() 60 | 61 | def get_model(self): 62 | return self.transformer 63 | 64 | def _set_gradient_checkpointing(self, module, value=False): 65 | if isinstance(module, LlavaMptModel): 66 | module.gradient_checkpointing = value 67 | 68 | def forward( 69 | self, 70 | input_ids: Optional[torch.LongTensor] = None, 71 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 72 | attention_mask: Optional[torch.Tensor] = None, 73 | inputs_embeds: Optional[torch.Tensor] = None, 74 | labels: Optional[torch.Tensor] = None, 75 | use_cache: Optional[bool] = None, 76 | output_attentions: Optional[bool] = None, 77 | output_hidden_states: Optional[bool] = None, 78 | return_dict: Optional[bool] = None, 79 | cache_position=None, 80 | images=None, 81 | ): 82 | 83 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 84 | 85 | return super().forward( 86 | input_ids, 87 | past_key_values=past_key_values, 88 | attention_mask=attention_mask, 89 | inputs_embeds=inputs_embeds, 90 | labels=labels, 91 | use_cache=use_cache, 92 | output_attentions=output_attentions, 93 | output_hidden_states=output_hidden_states, 94 | return_dict=return_dict, 95 | ) 96 | 97 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 98 | images = kwargs.pop("images", None) 99 | _inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 100 | _inputs["images"] = images 101 | return _inputs 102 | 103 | 104 | AutoConfig.register("llava_mpt", LlavaMptConfig) 105 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 106 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from tqdm 
import tqdm 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | from llava.model.utils import auto_upgrade # NotImplement Yet 12 | 13 | 14 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 15 | print("Loading base model") 16 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 31 | bparam = base.state_dict()[name] 32 | param.data[: bparam.shape[0], : bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | from .imagebind import ImageBindWrapper 4 | from .open_clip_encoder import OpenCLIPVisionTower 5 | from .hf_vision import HFVisionTower 6 | from .siglip_encoder import SigLipVisionTower 7 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 8 | 9 | # from .eva_clip.eva_clip_encoder import EvaClipVisionTower 10 | # from .dev_eva_clip.eva_vit import EvaViTWrapper 11 | 12 | 13 | def build_vision_tower(vision_tower_cfg, **kwargs): 14 | vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None)) 15 | is_absolute_path_exists = os.path.exists(vision_tower) 16 | use_s2 = getattr(vision_tower_cfg, "s2", False) 17 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 18 | if use_s2: 19 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 20 | else: 21 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 22 | elif "siglip" in vision_tower: 23 | return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs) 24 | elif vision_tower.startswith("hf:"): 25 | return HFVisionTower(vision_tower, args=vision_tower_cfg, 
**kwargs) 26 | elif vision_tower in ["imagebind_huge"]: 27 | return ImageBindWrapper(vision_tower, args=vision_tower_cfg, **kwargs) 28 | elif vision_tower.startswith("open_clip_hub"): 29 | return OpenCLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 30 | # elif "internal-eva" in vision_tower.lower() or "eva02" in vision_tower.lower(): 31 | # return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 32 | # elif vision_tower in ["EVA-CLIP-8B", "EVA-CLIP-8B-plus"]: 33 | # return EvaViTWrapper(vision_tower, args=vision_tower_cfg, **kwargs) 34 | 35 | raise ValueError(f"Unknown vision tower: {vision_tower}") 36 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg, convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 6 | from .openai import load_openai_model, list_openai_models 7 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 8 | from .tokenizer import SimpleTokenizer, tokenize 9 | from .transform import image_transform 10 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionXLab/mllm-mmrotate/110f0ffdfd6788c0a73eaa3457cc4dfbdfc28bbb/lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings", 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 
24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings", 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens", 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | "bert": { 46 | "config_names": { 47 | "context_length": "max_position_embeddings", 48 | "vocab_size": "vocab_size", 49 | "width": "hidden_size", 50 | "heads": "num_attention_heads", 51 | "layers": "num_hidden_layers", 52 | "layer_attr": "layer", 53 | "token_embeddings_attr": "embeddings", 54 | }, 55 | "pooler": "mean_pooler", 56 | }, 57 | } 58 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1536, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 5120, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-18b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": true, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-plus-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } 
-------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 
15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | 
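[Editor's note - standalone sketch, not part of the repository] These per-model JSON files are what the factory code included later in this dump (get_model_config in eva_clip/factory.py) looks up by filename stem, and the vision_cfg values feed the EvaClipVisionTower properties. A minimal sketch of that flow, assuming only the JSON layout above (top-level embed_dim, vision_cfg, text_cfg); the directory path and helper name are illustrative:

    import json
    from pathlib import Path

    def load_model_config(config_dir, model_name):
        # Each "<model_name>.json" under model_configs/ holds embed_dim, vision_cfg
        # and text_cfg for one EVA-CLIP variant; factory.py registers them by stem.
        cfg_path = Path(config_dir) / f"{model_name}.json"
        with open(cfg_path, "r", encoding="utf8") as f:
            return json.load(f)

    cfg = load_model_config("model_configs", "EVA02-CLIP-L-14-336")  # illustrative path
    v = cfg["vision_cfg"]
    patches_per_side = v["image_size"] // v["patch_size"]  # 336 // 14 = 24
    num_patches = patches_per_side ** 2                    # 576, matching EvaClipVisionTower.num_patches
    print(cfg["embed_dim"], num_patches)                   # 768 576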
-------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import List, Optional, Union 9 | 10 | import torch 11 | 12 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype 13 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url 14 | 15 | __all__ = ["list_openai_models", "load_openai_model"] 16 | 17 | 18 | def list_openai_models() -> List[str]: 19 | """Returns the names of available CLIP models""" 20 | return list_pretrained_models_by_tag("openai") 21 | 22 | 23 | def load_openai_model( 24 | name: str, 25 | precision: Optional[str] = None, 26 | device: Optional[Union[str, torch.device]] = None, 27 | jit: bool = True, 28 | cache_dir: Optional[str] = None, 29 | ): 30 | """Load a CLIP model 31 | 32 | Parameters 33 | ---------- 34 | name : str 35 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 36 | precision: str 37 | Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. 38 | device : Union[str, torch.device] 39 | The device to put the loaded model 40 | jit : bool 41 | Whether to load the optimized JIT model (default) or more hackable non-JIT model. 42 | cache_dir : Optional[str] 43 | The directory to cache the downloaded model weights 44 | 45 | Returns 46 | ------- 47 | model : torch.nn.Module 48 | The CLIP model 49 | preprocess : Callable[[PIL.Image], torch.Tensor] 50 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 51 | """ 52 | if device is None: 53 | device = "cuda" if torch.cuda.is_available() else "cpu" 54 | if precision is None: 55 | precision = "fp32" if device == "cpu" else "fp16" 56 | 57 | if get_pretrained_url(name, "openai"): 58 | model_path = download_pretrained_from_url(get_pretrained_url(name, "openai"), cache_dir=cache_dir) 59 | elif os.path.isfile(name): 60 | model_path = name 61 | else: 62 | raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}") 63 | 64 | try: 65 | # loading JIT archive 66 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 67 | state_dict = None 68 | except RuntimeError: 69 | # loading saved state dict 70 | if jit: 71 | warnings.warn(f"File {model_path} is not a JIT archive. 
Loading as a state dict instead") 72 | jit = False 73 | state_dict = torch.load(model_path, map_location="cpu") 74 | 75 | if not jit: 76 | # Build a non-jit model from the OpenAI jitted model state dict 77 | cast_dtype = get_cast_dtype(precision) 78 | try: 79 | model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype) 80 | except KeyError: 81 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 82 | model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype) 83 | 84 | # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use 85 | model = model.to(device) 86 | if precision.startswith("amp") or precision == "fp32": 87 | model.float() 88 | elif precision == "bf16": 89 | convert_weights_to_lp(model, dtype=torch.bfloat16) 90 | 91 | return model 92 | 93 | # patch the device names 94 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) 95 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] 96 | 97 | def patch_device(module): 98 | try: 99 | graphs = [module.graph] if hasattr(module, "graph") else [] 100 | except RuntimeError: 101 | graphs = [] 102 | 103 | if hasattr(module, "forward1"): 104 | graphs.append(module.forward1.graph) 105 | 106 | for graph in graphs: 107 | for node in graph.findAllNodes("prim::Constant"): 108 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): 109 | node.copyAttributes(device_node) 110 | 111 | model.apply(patch_device) 112 | patch_device(model.encode_image) 113 | patch_device(model.encode_text) 114 | 115 | # patch dtype to float32 (typically for CPU) 116 | if precision == "fp32": 117 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) 118 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 119 | float_node = float_input.node() 120 | 121 | def patch_float(module): 122 | try: 123 | graphs = [module.graph] if hasattr(module, "graph") else [] 124 | except RuntimeError: 125 | graphs = [] 126 | 127 | if hasattr(module, "forward1"): 128 | graphs.append(module.forward1.graph) 129 | 130 | for graph in graphs: 131 | for node in graph.findAllNodes("aten::to"): 132 | inputs = list(node.inputs()) 133 | for i in [1, 2]: # dtype can be the second or third argument to aten::to() 134 | if inputs[i].node()["value"] == 5: 135 | inputs[i].node().copyAttributes(float_node) 136 | 137 | model.apply(patch_float) 138 | patch_float(model.encode_image) 139 | patch_float(model.encode_text) 140 | model.float() 141 | 142 | # ensure image_size attr available at consistent location for both jit and non-jit 143 | model.visual.image_size = model.input_resolution.item() 144 | return model 145 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/rope.py: -------------------------------------------------------------------------------- 1 | from math import pi 2 | import torch 3 | from torch import nn 4 | from einops import rearrange, repeat 5 | import logging 6 | 7 | 8 | def broadcat(tensors, dim=-1): 9 | num_tensors = len(tensors) 10 | shape_lens = set(list(map(lambda t: len(t.shape), tensors))) 11 | assert len(shape_lens) == 1, "tensors must all have the same number of dimensions" 12 | shape_len = list(shape_lens)[0] 13 | dim = (dim + shape_len) if dim < 0 else dim 14 | dims = list(zip(*map(lambda t: list(t.shape), 
tensors))) 15 | expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] 16 | assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), "invalid dimensions for broadcastable concatentation" 17 | max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) 18 | expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) 19 | expanded_dims.insert(dim, (dim, dims[dim])) 20 | expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) 21 | tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes))) 22 | return torch.cat(tensors, dim=dim) 23 | 24 | 25 | def rotate_half(x): 26 | x = rearrange(x, "... (d r) -> ... d r", r=2) 27 | x1, x2 = x.unbind(dim=-1) 28 | x = torch.stack((-x2, x1), dim=-1) 29 | return rearrange(x, "... d r -> ... (d r)") 30 | 31 | 32 | class VisionRotaryEmbedding(nn.Module): 33 | def __init__( 34 | self, 35 | dim, 36 | pt_seq_len, 37 | ft_seq_len=None, 38 | custom_freqs=None, 39 | freqs_for="lang", 40 | theta=10000, 41 | max_freq=10, 42 | num_freqs=1, 43 | ): 44 | super().__init__() 45 | if custom_freqs: 46 | freqs = custom_freqs 47 | elif freqs_for == "lang": 48 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 49 | elif freqs_for == "pixel": 50 | freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi 51 | elif freqs_for == "constant": 52 | freqs = torch.ones(num_freqs).float() 53 | else: 54 | raise ValueError(f"unknown modality {freqs_for}") 55 | 56 | if ft_seq_len is None: 57 | ft_seq_len = pt_seq_len 58 | t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len 59 | 60 | freqs_h = torch.einsum("..., f -> ... f", t, freqs) 61 | freqs_h = repeat(freqs_h, "... n -> ... (n r)", r=2) 62 | 63 | freqs_w = torch.einsum("..., f -> ... f", t, freqs) 64 | freqs_w = repeat(freqs_w, "... n -> ... (n r)", r=2) 65 | 66 | freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1) 67 | 68 | self.register_buffer("freqs_cos", freqs.cos()) 69 | self.register_buffer("freqs_sin", freqs.sin()) 70 | 71 | logging.info(f"Shape of rope freq: {self.freqs_cos.shape}") 72 | 73 | def forward(self, t, start_index=0): 74 | rot_dim = self.freqs_cos.shape[-1] 75 | end_index = start_index + rot_dim 76 | assert rot_dim <= t.shape[-1], f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}" 77 | t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:] 78 | t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin) 79 | 80 | return torch.cat((t_left, t, t_right), dim=-1) 81 | 82 | 83 | class VisionRotaryEmbeddingFast(nn.Module): 84 | def __init__(self, dim, pt_seq_len, ft_seq_len=None, custom_freqs=None, freqs_for="lang", theta=10000, max_freq=10, num_freqs=1, patch_dropout=0.0): 85 | super().__init__() 86 | if custom_freqs: 87 | freqs = custom_freqs 88 | elif freqs_for == "lang": 89 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 90 | elif freqs_for == "pixel": 91 | freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi 92 | elif freqs_for == "constant": 93 | freqs = torch.ones(num_freqs).float() 94 | else: 95 | raise ValueError(f"unknown modality {freqs_for}") 96 | 97 | if ft_seq_len is None: 98 | ft_seq_len = pt_seq_len 99 | t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len 100 | 101 | freqs = torch.einsum("..., f -> ... f", t, freqs) 102 | freqs = repeat(freqs, "... n -> ... 
(n r)", r=2) 103 | freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1) 104 | 105 | freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) 106 | freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) 107 | 108 | self.patch_dropout = patch_dropout 109 | 110 | self.register_buffer("freqs_cos", freqs_cos) 111 | self.register_buffer("freqs_sin", freqs_sin) 112 | 113 | logging.info(f"Shape of rope freq: {self.freqs_cos.shape}") 114 | 115 | def forward(self, t, patch_indices_keep=None): 116 | if patch_indices_keep is not None: 117 | batch = t.size()[0] 118 | batch_indices = torch.arange(batch) 119 | batch_indices = batch_indices[..., None] 120 | 121 | freqs_cos = repeat(self.freqs_cos, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) 122 | freqs_sin = repeat(self.freqs_sin, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) 123 | 124 | freqs_cos = freqs_cos[batch_indices, patch_indices_keep] 125 | freqs_cos = rearrange(freqs_cos, "n i m j -> n m i j") 126 | freqs_sin = freqs_sin[batch_indices, patch_indices_keep] 127 | freqs_sin = rearrange(freqs_sin, "n i m j -> n m i j") 128 | 129 | return t * freqs_cos + rotate_half(t) * freqs_sin 130 | 131 | return t * self.freqs_cos + rotate_half(t) * self.freqs_sin 132 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/timm_model.py: -------------------------------------------------------------------------------- 1 | """ timm model adapter 2 | 3 | Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model. 4 | """ 5 | 6 | import logging 7 | from collections import OrderedDict 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | try: 13 | import timm 14 | from timm.models.layers import Mlp, to_2tuple 15 | 16 | try: 17 | # old timm imports < 0.8.1 18 | from timm.models.layers.attention_pool2d import RotAttentionPool2d 19 | from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d 20 | except ImportError: 21 | # new timm imports >= 0.8.1 22 | from timm.layers import RotAttentionPool2d 23 | from timm.layers import AttentionPool2d as AbsAttentionPool2d 24 | except ImportError: 25 | timm = None 26 | 27 | from .utils import freeze_batch_norm_2d 28 | 29 | 30 | class TimmModel(nn.Module): 31 | """timm model adapter 32 | # FIXME this adapter is a work in progress, may change in ways that break weight compat 33 | """ 34 | 35 | def __init__(self, model_name, embed_dim, image_size=224, pool="avg", proj="linear", proj_bias=False, drop=0.0, pretrained=False): 36 | super().__init__() 37 | if timm is None: 38 | raise RuntimeError("Please `pip install timm` to use timm models.") 39 | 40 | self.image_size = to_2tuple(image_size) 41 | self.trunk = timm.create_model(model_name, pretrained=pretrained) 42 | feat_size = self.trunk.default_cfg.get("pool_size", None) 43 | feature_ndim = 1 if not feat_size else 2 44 | if pool in ("abs_attn", "rot_attn"): 45 | assert feature_ndim == 2 46 | # if attn pooling used, remove both classifier and default pool 47 | self.trunk.reset_classifier(0, global_pool="") 48 | else: 49 | # reset global pool if pool config set, otherwise leave as network default 50 | reset_kwargs = dict(global_pool=pool) if pool else {} 51 | self.trunk.reset_classifier(0, **reset_kwargs) 52 | prev_chs = self.trunk.num_features 53 | 54 | head_layers = OrderedDict() 55 | if pool == "abs_attn": 56 | head_layers["pool"] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim) 57 | 
prev_chs = embed_dim 58 | elif pool == "rot_attn": 59 | head_layers["pool"] = RotAttentionPool2d(prev_chs, out_features=embed_dim) 60 | prev_chs = embed_dim 61 | else: 62 | assert proj, "projection layer needed if non-attention pooling is used." 63 | 64 | # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used 65 | if proj == "linear": 66 | head_layers["drop"] = nn.Dropout(drop) 67 | head_layers["proj"] = nn.Linear(prev_chs, embed_dim, bias=proj_bias) 68 | elif proj == "mlp": 69 | head_layers["mlp"] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop, bias=(True, proj_bias)) 70 | 71 | self.head = nn.Sequential(head_layers) 72 | 73 | def lock(self, unlocked_groups=0, freeze_bn_stats=False): 74 | """lock modules 75 | Args: 76 | unlocked_groups (int): leave last n layer groups unlocked (default: 0) 77 | """ 78 | if not unlocked_groups: 79 | # lock full model 80 | for param in self.trunk.parameters(): 81 | param.requires_grad = False 82 | if freeze_bn_stats: 83 | freeze_batch_norm_2d(self.trunk) 84 | else: 85 | # NOTE: partial freeze requires latest timm (master) branch and is subject to change 86 | try: 87 | # FIXME import here until API stable and in an official release 88 | from timm.models.helpers import group_parameters, group_modules 89 | except ImportError: 90 | raise RuntimeError("Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`") 91 | matcher = self.trunk.group_matcher() 92 | gparams = group_parameters(self.trunk, matcher) 93 | max_layer_id = max(gparams.keys()) 94 | max_layer_id = max_layer_id - unlocked_groups 95 | for group_idx in range(max_layer_id + 1): 96 | group = gparams[group_idx] 97 | for param in group: 98 | self.trunk.get_parameter(param).requires_grad = False 99 | if freeze_bn_stats: 100 | gmodules = group_modules(self.trunk, matcher, reverse=True) 101 | gmodules = {k for k, v in gmodules.items() if v <= max_layer_id} 102 | freeze_batch_norm_2d(self.trunk, gmodules) 103 | 104 | @torch.jit.ignore 105 | def set_grad_checkpointing(self, enable=True): 106 | try: 107 | self.trunk.set_grad_checkpointing(enable) 108 | except Exception as e: 109 | logging.warning("grad checkpointing not supported for this timm image tower, continuing without...") 110 | 111 | def forward(self, x): 112 | x = self.trunk(x) 113 | x = self.head(x) 114 | return x 115 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/dev_eva_clip/eva_clip/transform.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms.functional as F 6 | 7 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, CenterCrop 8 | 9 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 10 | 11 | 12 | class ResizeMaxSize(nn.Module): 13 | 14 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn="max", fill=0): 15 | super().__init__() 16 | if not isinstance(max_size, int): 17 | raise TypeError(f"Size should be int. 
Got {type(max_size)}") 18 | self.max_size = max_size 19 | self.interpolation = interpolation 20 | self.fn = min if fn == "min" else min 21 | self.fill = fill 22 | 23 | def forward(self, img): 24 | if isinstance(img, torch.Tensor): 25 | height, width = img.shape[:2] 26 | else: 27 | width, height = img.size 28 | scale = self.max_size / float(max(height, width)) 29 | if scale != 1.0: 30 | new_size = tuple(round(dim * scale) for dim in (height, width)) 31 | img = F.resize(img, new_size, self.interpolation) 32 | pad_h = self.max_size - new_size[0] 33 | pad_w = self.max_size - new_size[1] 34 | img = F.pad(img, padding=[pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2], fill=self.fill) 35 | return img 36 | 37 | 38 | def _convert_to_rgb(image): 39 | return image.convert("RGB") 40 | 41 | 42 | # class CatGen(nn.Module): 43 | # def __init__(self, num=4): 44 | # self.num = num 45 | # def mixgen_batch(image, text): 46 | # batch_size = image.shape[0] 47 | # index = np.random.permutation(batch_size) 48 | 49 | # cat_images = [] 50 | # for i in range(batch_size): 51 | # # image mixup 52 | # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:] 53 | # # text concat 54 | # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0] 55 | # text = torch.stack(text) 56 | # return image, text 57 | 58 | 59 | def image_transform( 60 | image_size: int, 61 | is_train: bool, 62 | mean: Optional[Tuple[float, ...]] = None, 63 | std: Optional[Tuple[float, ...]] = None, 64 | resize_longest_max: bool = False, 65 | fill_color: int = 0, 66 | ): 67 | mean = mean or OPENAI_DATASET_MEAN 68 | if not isinstance(mean, (list, tuple)): 69 | mean = (mean,) * 3 70 | 71 | std = std or OPENAI_DATASET_STD 72 | if not isinstance(std, (list, tuple)): 73 | std = (std,) * 3 74 | 75 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 76 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 77 | image_size = image_size[0] 78 | 79 | normalize = Normalize(mean=mean, std=std) 80 | if is_train: 81 | return Compose( 82 | [ 83 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 84 | _convert_to_rgb, 85 | ToTensor(), 86 | normalize, 87 | ] 88 | ) 89 | else: 90 | if resize_longest_max: 91 | transforms = [ResizeMaxSize(image_size, fill=fill_color)] 92 | else: 93 | transforms = [ 94 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 95 | CenterCrop(image_size), 96 | ] 97 | transforms.extend( 98 | [ 99 | _convert_to_rgb, 100 | ToTensor(), 101 | normalize, 102 | ] 103 | ) 104 | return Compose(transforms) 105 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/eva_clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .eva_clip_processors import EvaClipImageTrainProcessor 5 | from .eva_vit import EVAEncoderWrapper 6 | from .factory import list_models, add_model_config, get_model_config 7 | 8 | from lmmrotate.utils import rank0_print 9 | 10 | 11 | class EvaClipVisionTower(nn.Module): 12 | def __init__(self, vision_tower, args, delay_load=False): 13 | super().__init__() 14 | 15 | self.is_loaded = False 16 | self.vision_tower_name = vision_tower 17 | self.vision_tower_pretrained = args.vision_tower_pretrained 18 | self.config = get_model_config(vision_tower) 19 | 20 | if not delay_load: 21 | rank0_print(f"Loading EVA ViT: 
{self.vision_tower_name}") 22 | self.load_model() 23 | elif getattr(args, "unfreeze_mm_vision_tower", False): 24 | # TODO: better detector is needed. 25 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") 26 | self.load_model() 27 | elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: 28 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") 29 | self.load_model() 30 | else: 31 | self.cfg_only = self.config 32 | 33 | def load_model(self, device_map=None): 34 | rank0_print(f"Pretrained: {self.vision_tower_pretrained}") 35 | self.image_processor = EvaClipImageTrainProcessor(self.config["vision_cfg"]["image_size"]) 36 | self.vision_tower = EVAEncoderWrapper(self.vision_tower_pretrained, self.config) 37 | rank0_print(f"Loaded image processor: {self.image_processor}") 38 | self.vision_tower.requires_grad_(False) 39 | self.is_loaded = True 40 | 41 | def forward(self, images): 42 | if type(images) is list: 43 | image_features = [] 44 | for image in images: 45 | image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype)).to(images.dtype) 49 | 50 | return image_features 51 | 52 | @property 53 | def dtype(self): 54 | return self.vision_tower.dtype 55 | 56 | @property 57 | def device(self): 58 | return self.vision_tower.device 59 | 60 | @property 61 | def hidden_size(self): 62 | return self.config["vision_cfg"]["width"] 63 | 64 | @property 65 | def num_patches(self): 66 | return (self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"]) ** 2 67 | 68 | @property 69 | def num_patches_per_side(self): 70 | return self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"] 71 | 72 | @property 73 | def image_size(self): 74 | return self.config["vision_cfg"]["image_size"] 75 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/eva_clip_processors.py: -------------------------------------------------------------------------------- 1 | """ 2 | # Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP 3 | """ 4 | 5 | from torchvision import transforms 6 | from torchvision.transforms.functional import InterpolationMode 7 | from transformers.image_processing_utils import BatchFeature 8 | from PIL import Image 9 | from transformers.image_transforms import convert_to_rgb 10 | 11 | 12 | class BaseProcessor: 13 | def __init__(self): 14 | self.transform = lambda x: x 15 | return 16 | 17 | def __call__(self, item): 18 | return self.transform(item) 19 | 20 | 21 | class EvaClipImageBaseProcessor(BaseProcessor): 22 | def __init__(self, mean=None, std=None): 23 | self.mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean 24 | self.std = (0.26862954, 0.26130258, 0.27577711) if std is None else std 25 | 26 | self.normalize = transforms.Normalize(self.mean, self.std) 27 | 28 | @property 29 | def image_mean(self): 30 | return self.mean 31 | 32 | 33 | class EvaClipImageTrainProcessor(EvaClipImageBaseProcessor): 34 | def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0): 35 | super().__init__(mean=mean, std=std) 36 | 37 | self.transform = transforms.Compose( 38 | [ 39 | convert_to_rgb, 40 | 
transforms.Resize( 41 | image_size, 42 | interpolation=InterpolationMode.BICUBIC, 43 | ), 44 | transforms.CenterCrop(image_size), 45 | transforms.ToTensor(), 46 | self.normalize, 47 | ] 48 | ) 49 | 50 | self.image_size = image_size 51 | 52 | def preprocess(self, images, return_tensors): 53 | if isinstance(images, Image.Image): 54 | images = [images] 55 | else: 56 | assert isinstance(images, list) 57 | 58 | transformed_images = [self.transform(image).numpy() for image in images] 59 | data = {"pixel_values": transformed_images} 60 | 61 | return BatchFeature(data=data, tensor_type=return_tensors) 62 | 63 | def __call__(self, item): 64 | return self.transform(item) 65 | 66 | @property 67 | def crop_size(self): 68 | return {"height": self.image_size, "width": self.image_size} 69 | 70 | @property 71 | def size(self): 72 | return {"shortest_edge": self.image_size} 73 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/factory.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pathlib 5 | import re 6 | from copy import deepcopy 7 | from pathlib import Path 8 | from typing import Optional, Tuple, Union, Dict, Any 9 | import torch 10 | 11 | _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"] 12 | _MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs 13 | 14 | 15 | def _natural_key(string_): 16 | return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())] 17 | 18 | 19 | def _rescan_model_configs(): 20 | global _MODEL_CONFIGS 21 | 22 | config_ext = (".json",) 23 | config_files = [] 24 | for config_path in _MODEL_CONFIG_PATHS: 25 | if config_path.is_file() and config_path.suffix in config_ext: 26 | config_files.append(config_path) 27 | elif config_path.is_dir(): 28 | for ext in config_ext: 29 | config_files.extend(config_path.glob(f"*{ext}")) 30 | 31 | for cf in config_files: 32 | with open(cf, "r", encoding="utf8") as f: 33 | model_cfg = json.load(f) 34 | if all(a in model_cfg for a in ("embed_dim", "vision_cfg", "text_cfg")): 35 | _MODEL_CONFIGS[cf.stem] = model_cfg 36 | 37 | _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))) 38 | 39 | 40 | _rescan_model_configs() # initial populate of model config registry 41 | 42 | 43 | def list_models(): 44 | """enumerate available model architectures based on config files""" 45 | return list(_MODEL_CONFIGS.keys()) 46 | 47 | 48 | def add_model_config(path): 49 | """add model config path or file and update registry""" 50 | if not isinstance(path, Path): 51 | path = Path(path) 52 | _MODEL_CONFIG_PATHS.append(path) 53 | _rescan_model_configs() 54 | 55 | 56 | def get_model_config(model_name): 57 | if model_name in _MODEL_CONFIGS: 58 | return deepcopy(_MODEL_CONFIGS[model_name]) 59 | else: 60 | return None 61 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1536, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 5120, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-18b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": true, 15 | "fusedLN": false, 16 | 
"use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-plus-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | 
"head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 
49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/hf_vision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import AutoModel, AutoImageProcessor, AutoConfig, CLIPImageProcessor 5 | from lmmrotate.utils import rank0_print 6 | 7 | 8 | class HFVisionTower(nn.Module): 9 | def __init__(self, vision_tower, args, delay_load=False): 10 | super().__init__() 11 | 12 | self.is_loaded = False 13 | 14 | self.vision_tower_name = vision_tower.replace("hf:", "", 1) 15 | self.select_layer = args.mm_vision_select_layer 16 | self.select_feature = getattr(args, "mm_vision_select_feature", "patch") 17 | 18 | if not delay_load: 19 | self.load_model() 20 | else: 21 | self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name) 22 | 23 | def load_model(self): 24 | try: 25 | self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name) 26 | except Exception as e: 27 | if "448" in self.vision_tower_name: 28 | 
image_size = 448 29 | # use image processor with conig 30 | self.image_processor = CLIPImageProcessor(size={"shortest_edge": image_size}, do_center_crop=True, crop_size=image_size) 31 | else: 32 | self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") 33 | rank0_print(f"Loaded image processor: {self.image_processor}") 34 | self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda") 35 | self.device = self.vision_tower.device 36 | self.dtype = self.vision_tower.dtype 37 | self.config = self.vision_tower.config 38 | 39 | if hasattr(self.vision_tower, "vision_model"): 40 | self.vision_tower = self.vision_tower.vision_model 41 | self.vision_tower.requires_grad_(False) 42 | # self.vision_tower.eval() 43 | self.is_loaded = True 44 | 45 | def feature_select(self, image_forward_outs): 46 | select_feature_type = self.select_feature 47 | 48 | if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: 49 | select_every_k_layer = len(image_forward_outs.hidden_states) // 4 50 | image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) 51 | select_feature_type = select_feature_type.replace("slicefour_", "") 52 | else: 53 | image_features = image_forward_outs.hidden_states[self.select_layer] 54 | 55 | if select_feature_type == "patch": 56 | image_features = image_features[:, 1:] 57 | elif select_feature_type == "cls_patch": 58 | image_features = image_features 59 | else: 60 | raise ValueError(f"Unexpected select feature: {select_feature_type}") 61 | return image_features 62 | 63 | def forward(self, images): 64 | if type(images) is list: 65 | image_features = [] 66 | for image in images: 67 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 68 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 69 | image_features.append(image_feature) 70 | else: 71 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 72 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 73 | 74 | return image_features 75 | 76 | @property 77 | def dummy_feature(self): 78 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 79 | 80 | # @property 81 | # def dtype(self): 82 | # return self.vision_tower.dtype 83 | 84 | # @property 85 | # def device(self): 86 | # return self.vision_tower.device 87 | 88 | @property 89 | def hidden_size(self): 90 | try: 91 | _hidden_size = self.config.hidden_size 92 | except: 93 | _hidden_size = self.config.vision_config.hidden_size 94 | if "slicefour" in self.select_feature: 95 | _hidden_size *= 4 96 | return _hidden_size 97 | 98 | @property 99 | def num_patches(self): 100 | _num_patches = (self.config.image_size // self.config.patch_size) ** 2 101 | if "cls_patch" in self.select_feature: 102 | _num_patches += 1 103 | return _num_patches 104 | 105 | @property 106 | def num_patches_per_side(self): 107 | return self.config.image_size // self.config.patch_size 108 | 109 | @property 110 | def image_size(self): 111 | return self.config.image_size 112 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_encoder/imagebind.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPImageProcessor 5 | 6 | try: 7 | from imagebind.models import imagebind_model 8 | from imagebind.models.imagebind_model import ModalityType 9 | from imagebind.data import load_and_transform_audio_data 10 | except ImportError: 11 | pass 12 | 13 | 14 | class ImageBindWrapper(nn.Module): 15 | def __init__(self, vision_tower, select_layer, select_feature="patch", delay_load=False): 16 | super().__init__() 17 | 18 | self.is_loaded = False 19 | 20 | self.vision_tower_name = vision_tower 21 | self.select_layer = select_layer 22 | self.select_feature = select_feature 23 | 24 | if not delay_load: 25 | self.load_model() 26 | 27 | def load_model(self): 28 | self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") 29 | self.vision_tower = imagebind_model.imagebind_huge(pretrained=True) 30 | for p in self.vision_tower.parameters(): 31 | p.requires_grad = False 32 | self.vision_tower.eval() 33 | self.is_loaded = True 34 | 35 | def train(self, mode=True): 36 | self.training = mode 37 | 38 | if self.is_loaded: 39 | self.vision_tower.eval() 40 | 41 | @torch.no_grad() 42 | def forward(self, x): 43 | if type(x) == dict: 44 | if x["audios"] is not None: 45 | inputs = {ModalityType.AUDIO: load_and_transform_audio_data(x["audios"], device=self.device).half()} 46 | embeddings = self.vision_tower(inputs) 47 | audio_embedding = embeddings[ModalityType.AUDIO] 48 | return audio_embedding.unsqueeze(1) 49 | else: 50 | inputs = {ModalityType.VISION: x.to(dtype=self.dtype)} 51 | embeddings = self.vision_tower(inputs) 52 | vision_embedding = embeddings[ModalityType.VISION] 53 | if vision_embedding.ndim == 2: 54 | return vision_embedding.unsqueeze(1) 55 | if vision_embedding.shape[1] == 257: 56 | return vision_embedding[:, 1:] 57 | raise ValueError(f"Unexpected shape: {vision_embedding.shape}") 58 | 59 | @property 60 | def dummy_feature(self): 61 | return torch.zeros(1, 1024, device=self.device, dtype=self.dtype) 62 | 63 | @property 64 | def dtype(self): 65 | return self.vision_tower.modality_preprocessors.vision.cls_token.dtype 66 | 67 | @property 68 | def device(self): 69 | return self.vision_tower.modality_preprocessors.vision.cls_token.device 70 | 71 | @property 72 | def hidden_size(self): 73 | return 1024 74 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | from .pooler_projector import PoolerProjector 6 | 7 | 8 | class IdentityMap(nn.Module): 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def forward(self, x, *args, **kwargs): 13 | return x 14 | 15 | @property 16 | def config(self): 17 | return {"mm_projector_type": "identity"} 18 | 19 | 20 | class SimpleResBlock(nn.Module): 21 | def __init__(self, channels): 22 | super().__init__() 23 | self.pre_norm = nn.LayerNorm(channels) 24 | 25 | self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)) 26 | 27 | def forward(self, x): 28 | x = self.pre_norm(x) 29 | return x + self.proj(x) 30 | 31 | 32 | def build_vision_projector(config, delay_load=False, **kwargs): 33 | projector_type = getattr(config, "mm_projector_type", "linear") 34 | 35 | if projector_type == "linear": 36 | return 
nn.Linear(config.mm_hidden_size, config.hidden_size) 37 | 38 | if projector_type == "pooler": 39 | return PoolerProjector(config, kwargs["vision_cfg"]) 40 | 41 | mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type) 42 | if mlp_gelu_match: 43 | mlp_depth = int(mlp_gelu_match.group(1)) 44 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 45 | for _ in range(1, mlp_depth): 46 | modules.append(nn.GELU()) 47 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 48 | return nn.Sequential(*modules) 49 | 50 | mlp_gelu_resnet_match = re.match(r"^mlp(\d+)x_res(\d+)x_gelu$", projector_type) 51 | if mlp_gelu_resnet_match: 52 | mlp_depth = int(mlp_gelu_resnet_match.group(1)) 53 | res_depth = int(mlp_gelu_resnet_match.group(2)) 54 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 55 | for _ in range(1, mlp_depth): 56 | modules.append(nn.GELU()) 57 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 58 | for _ in range(res_depth): 59 | modules.append(SimpleResBlock(config.hidden_size)) 60 | return nn.Sequential(*modules) 61 | 62 | if projector_type == "identity": 63 | return IdentityMap() 64 | 65 | raise ValueError(f"Unknown projector type: {projector_type}") 66 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_projector/pooler_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import math 5 | 6 | from transformers.models.clip.modeling_clip import CLIPVisionModel 7 | 8 | 9 | class PoolerProjector(nn.Module): 10 | def __init__(self, config, vision_cfg): 11 | super().__init__() 12 | self._config = config 13 | self.hw = vision_cfg.image_size // vision_cfg.patch_size 14 | 15 | self.conv_pool = nn.Conv2d(config.mm_hidden_size, config.hidden_size, kernel_size=2, stride=2) 16 | 17 | self.proj = nn.Sequential( 18 | nn.GELU(), 19 | nn.Linear(config.hidden_size, config.hidden_size), 20 | ) 21 | 22 | def forward(self, x, *args, **kwargs): 23 | height = width = self.hw 24 | assert height * width == x.shape[1] 25 | x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2) 26 | x = self.conv_pool(x) 27 | x = x.flatten(2).transpose(1, 2) 28 | x = self.proj(x) 29 | return x 30 | 31 | @property 32 | def config(self): 33 | return {"mm_projector_type": "pooler"} 34 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_resampler/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .masked_drop import MaskedDrop 4 | from .spatial_pool import SpatialPool 5 | from .perceiver import PerceiverResampler 6 | from .qformer import Qformer 7 | 8 | 9 | class IdentityMap(torch.nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | @property 17 | def config(self): 18 | return {"mm_resampler_type": None} 19 | 20 | 21 | def build_vision_resampler(model_args, delay_load=False, **kwargs): 22 | resampler_type = getattr(model_args, "mm_resampler_type", None) 23 | if resampler_type == "masked_drop": 24 | return MaskedDrop(model_args) 25 | elif resampler_type == "spatial_pool": 26 | return SpatialPool(model_args, **kwargs) 27 | elif resampler_type == "perceiver": 28 | return PerceiverResampler(model_args, **kwargs) 29 | elif resampler_type == "qformer": 30 | return Qformer(model_args, **kwargs) 31 | 
elif resampler_type is None: 32 | return IdentityMap() 33 | 34 | raise ValueError(f"Unknown resampler type: {resampler_type}") 35 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_resampler/masked_drop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import random 5 | 6 | 7 | class MaskedDrop(nn.Module): 8 | def __init__(self, model_args): 9 | super().__init__() 10 | 11 | self.mode = model_args.mm_mask_drop_mode 12 | self.skip_percentage = model_args.mm_mask_drop_skip_percentage 13 | self.ratio = model_args.mm_mask_drop_ratio 14 | self.ratio_upper = model_args.mm_mask_drop_ratio_upper 15 | self.ratio_lower = model_args.mm_mask_drop_ratio_lower 16 | 17 | def forward(self, image_features, *args, **kwargs): 18 | 19 | if not self.training: 20 | return image_features 21 | 22 | if self.skip_percentage > random.random(): 23 | return image_features 24 | 25 | masked_features = [] 26 | 27 | for image_feature in image_features: 28 | num_tokens = image_feature.shape[0] 29 | if self.mode == "fixed": 30 | num_keep = int(num_tokens * self.ratio) 31 | masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0][0]) 32 | elif self.mode == "range": 33 | num_keep = int(num_tokens * random.uniform(self.ratio_lower, self.ratio_upper)) 34 | masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0]) 35 | elif self.mode == "cls_only": 36 | masked_features.append(image_feature[0:1]) 37 | else: 38 | raise ValueError(f"Unexpected masked drop mode: {self.mode}") 39 | 40 | if self.mode not in ["range"] and (type(image_features) is not list or self.mode in ["cls_only"]): 41 | masked_features = torch.stack(masked_features, dim=0) 42 | 43 | return masked_features 44 | 45 | @property 46 | def config(self): 47 | return { 48 | "mm_resampler_type": "masked_drop", 49 | "mm_mask_drop_mode": self.mode, 50 | "mm_mask_drop_skip_percentage": self.skip_percentage, 51 | "mm_mask_drop_ratio": self.ratio, 52 | "mm_mask_drop_ratio_upper": self.ratio_upper, 53 | "mm_mask_drop_ratio_lower": self.ratio_lower, 54 | } 55 | 56 | def random_masking(self, x, len_keep): 57 | """ 58 | Perform per-sample random masking by per-sample shuffling. 59 | Per-sample shuffling is done by argsort random noise. 
60 | x: [N, L, D], sequence 61 | """ 62 | N, L, D = x.shape # batch, length, dim 63 | 64 | noise = torch.rand(N, L, device=x.device) # noise in [0, 1] 65 | 66 | # sort noise for each sample 67 | ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove 68 | ids_restore = torch.argsort(ids_shuffle, dim=1) 69 | 70 | # keep the first subset 71 | ids_keep = ids_shuffle[:, :len_keep] 72 | x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D)) 73 | 74 | # generate the binary mask: 0 is keep, 1 is remove 75 | mask = torch.ones([N, L], device=x.device) 76 | mask[:, :len_keep] = 0 77 | # unshuffle to get the binary mask 78 | mask = torch.gather(mask, dim=1, index=ids_restore) 79 | 80 | return x_masked, mask, ids_restore 81 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_resampler/perceiver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from https://github.com/lucidrains/flamingo-pytorch 3 | """ 4 | 5 | import torch 6 | from einops import rearrange, repeat 7 | 8 | try: 9 | from einops_exts import rearrange_many 10 | except: 11 | pass 12 | 13 | from torch import einsum, nn 14 | 15 | 16 | def exists(val): 17 | return val is not None 18 | 19 | 20 | def FeedForward(dim, mult=4): 21 | inner_dim = int(dim * mult) 22 | return nn.Sequential( 23 | nn.LayerNorm(dim), 24 | nn.Linear(dim, inner_dim, bias=False), 25 | nn.GELU(), 26 | nn.Linear(inner_dim, dim, bias=False), 27 | ) 28 | 29 | 30 | class PerceiverAttention(nn.Module): 31 | def __init__(self, *, dim, dim_head=64, heads=8): 32 | super().__init__() 33 | self.scale = dim_head**-0.5 34 | self.heads = heads 35 | inner_dim = dim_head * heads 36 | 37 | self.norm_media = nn.LayerNorm(dim) 38 | self.norm_latents = nn.LayerNorm(dim) 39 | 40 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 41 | self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) 42 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 43 | 44 | def forward(self, x, latents): 45 | """ 46 | Args: 47 | x (torch.Tensor): image features 48 | shape (b, T, n1, D) 49 | latent (torch.Tensor): latent features 50 | shape (b, T, n2, D) 51 | """ 52 | x = self.norm_media(x) 53 | latents = self.norm_latents(latents) 54 | 55 | h = self.heads 56 | 57 | q = self.to_q(latents) 58 | kv_input = torch.cat((x, latents), dim=-2) 59 | k, v = self.to_kv(kv_input).chunk(2, dim=-1) 60 | q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h) 61 | q = q * self.scale 62 | 63 | # attention 64 | sim = einsum("... i d, ... j d -> ... i j", q, k) 65 | sim = sim - sim.amax(dim=-1, keepdim=True).detach() 66 | attn = sim.softmax(dim=-1) 67 | 68 | out = einsum("... i j, ... j d -> ... 
i d", attn, v) 69 | out = rearrange(out, "b h t n d -> b t n (h d)", h=h) 70 | return self.to_out(out) 71 | 72 | 73 | class PerceiverResamplerModule(nn.Module): 74 | def __init__( 75 | self, 76 | *, 77 | dim, 78 | depth=6, 79 | dim_head=64, 80 | heads=8, 81 | num_latents=64, 82 | max_num_media=None, 83 | max_num_frames=None, 84 | ff_mult=4, 85 | ): 86 | super().__init__() 87 | self.latents = nn.Parameter(torch.randn(num_latents, dim)) 88 | self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None 89 | self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None 90 | 91 | self.layers = nn.ModuleList([]) 92 | for _ in range(depth): 93 | self.layers.append( 94 | nn.ModuleList( 95 | [ 96 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 97 | FeedForward(dim=dim, mult=ff_mult) if ff_mult > 0 else nn.Identity(), 98 | ] 99 | ) 100 | ) 101 | 102 | self.norm = nn.LayerNorm(dim) 103 | 104 | def forward(self, x): 105 | """ 106 | Args: 107 | x (torch.Tensor): image features 108 | shape (b, T, F, v, D) 109 | Returns: 110 | shape (b, T, n, D) where n is self.num_latents 111 | """ 112 | b, T, F, v = x.shape[:4] 113 | 114 | # frame and media time embeddings 115 | if exists(self.frame_embs): 116 | frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v) 117 | x = x + frame_embs 118 | x = rearrange(x, "b T F v d -> b T (F v) d") # flatten the frame and spatial dimensions 119 | if exists(self.media_time_embs): 120 | x = x + self.media_time_embs[:T] 121 | 122 | # blocks 123 | latents = repeat(self.latents, "n d -> b T n d", b=b, T=T) 124 | for attn, ff in self.layers: 125 | latents = attn(x, latents) + latents 126 | latents = ff(latents) + latents 127 | return self.norm(latents) 128 | 129 | 130 | class PerceiverResampler(nn.Module): 131 | def __init__(self, model_args, vision_tower): 132 | super().__init__() 133 | 134 | self.depth = model_args.mm_perceiver_depth 135 | self.num_latents = model_args.mm_perceiver_latents 136 | self.ff_mult = model_args.mm_perceiver_ff_mult 137 | self.pretrained = model_args.mm_perceiver_pretrained 138 | 139 | self.perceiver = PerceiverResamplerModule(dim=vision_tower.hidden_size, depth=self.depth, num_latents=self.num_latents, ff_mult=self.ff_mult) 140 | 141 | if self.pretrained is not None: 142 | self.load_state_dict(torch.load(self.pretrained)) 143 | 144 | def forward(self, image_features, *args, **kwargs): 145 | return self.perceiver(image_features[:, None, None]).squeeze(1) 146 | 147 | @property 148 | def config(self): 149 | return { 150 | "mm_resampler_type": "perceiver", 151 | "mm_perceiver_depth": self.depth, 152 | "mm_perceiver_latents": self.num_latents, 153 | "mm_perceiver_ff_mult": self.ff_mult, 154 | "mm_perceiver_pretrained": self.pretrained, 155 | } 156 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/multimodal_resampler/spatial_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | 6 | class SpatialPool(nn.Module): 7 | def __init__(self, model_args, vision_tower): 8 | super().__init__() 9 | 10 | self.mode = model_args.mm_spatial_pool_mode 11 | self.stride = model_args.mm_spatial_pool_stride 12 | self.out_channels = getattr(model_args, "mm_spatial_pool_out_channels", vision_tower.hidden_size) 13 | 14 | if self.mode == "average": 15 | self.pool = nn.AvgPool2d(kernel_size=self.stride, 
stride=self.stride) 16 | elif self.mode == "max": 17 | self.pool = nn.MaxPool2d(kernel_size=self.stride, stride=self.stride) 18 | elif self.mode == "conv": 19 | self.pool = nn.Conv2d(in_channels=vision_tower.hidden_size, out_channels=self.out_channels, kernel_size=self.stride, stride=self.stride) 20 | else: 21 | raise ValueError(f"Unknown pooling mode: {self.mode}.") 22 | 23 | def forward(self, image_features, images, *args, **kwargs): 24 | ori_W = int(math.sqrt(image_features.shape[1] * images.shape[3] // images.shape[2])) 25 | ori_H = int(ori_W * images.shape[2] // images.shape[3]) 26 | 27 | B, _, F = image_features.shape 28 | 29 | image_features_spatial = image_features.view(B, ori_H, ori_W, F).permute(0, 3, 1, 2) 30 | image_features_spatial_pool = self.pool(image_features_spatial) 31 | 32 | return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous() 33 | 34 | @property 35 | def config(self): 36 | return { 37 | "mm_resampler_type": "spatial_pool", 38 | "mm_spatial_pool_stride": self.stride, 39 | "mm_spatial_pool_mode": self.mode, 40 | "mm_spatial_pool_out_channels": self.out_channels, 41 | } 42 | 43 | @property 44 | def hidden_size(self): 45 | return self.out_channels 46 | -------------------------------------------------------------------------------- /lmmrotate/models/llava/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if "llava" in config and "llava" not in cfg.model_type: 7 | assert cfg.model_type == "llama" 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = "LlavaLlamaForCausalLM" 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /lmmrotate/models/version.py: -------------------------------------------------------------------------------- 1 | default_version_commit_id = { 2 | "florence2": { 3 | "large": "f92844072980ab91bc708ae2fc8c1227318023a4", 4 | "base": "ceaf371f01ef66192264811b390bccad475a4f02", 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /lmmrotate/modules/rsar_dataset.py: -------------------------------------------------------------------------------- 1 | # https://github.com/zhasion/RSAR/blob/main/mmrotate/datasets/rsar.py 2 | # Copyright (c) OpenMMLab. All rights reserved. 3 | import glob 4 | import os.path as osp 5 | from typing import List 6 | 7 | from mmengine.dataset import BaseDataset 8 | 9 | from mmrotate.registry import DATASETS 10 | 11 | 12 | @DATASETS.register_module() 13 | class RSARDataset(BaseDataset): 14 | """RSAR dataset for detection. 15 | 16 | Note: ``ann_file`` in RSARDataset is different from the BaseDataset. 17 | In BaseDataset, it is the path of an annotation file. In RSARDataset, 18 | it is the path of a folder containing annotation txt files. 19 | 20 | Args: 21 | diff_thr (int): The difficulty threshold of ground truth.
Bboxes 22 | with difficulty higher than it will be ignored. The range of this 23 | value should be non-negative integer. Defaults to 100. 24 | """ 25 | 26 | METAINFO = { 27 | 'classes': ('ship', 'aircraft', 'car', 'tank', 'bridge', 'harbor'), 28 | # palette is a list of color tuples, which is used for visualization. 29 | 'palette': 30 | [(220, 20, 60), (0, 0, 230), (106, 0, 228), 31 | (0, 182, 0), (200, 182, 0), (0, 182, 200)] 32 | } 33 | 34 | def __init__(self, 35 | diff_thr: int = 100, 36 | **kwargs) -> None: 37 | self.diff_thr = diff_thr 38 | super().__init__(**kwargs) 39 | 40 | def load_data_list(self) -> List[dict]: 41 | """Load annotations from an annotation file named as ``self.ann_file`` 42 | Returns: 43 | List[dict]: A list of annotation. 44 | """ # noqa: E501 45 | cls_map = {c: i 46 | for i, c in enumerate(self.metainfo['classes']) 47 | } # in mmdet v2.0 label is 0-based 48 | data_list = [] 49 | if self.ann_file == '': 50 | img_files = glob.glob( 51 | osp.join(self.data_prefix['img_path'], '*')) 52 | for img_path in img_files: 53 | data_info = {} 54 | data_info['img_path'] = img_path 55 | img_name = osp.split(img_path)[1] 56 | data_info['file_name'] = img_name 57 | img_id = img_name[:-4] 58 | data_info['img_id'] = img_id 59 | 60 | instance = dict(bbox=[], bbox_label=[], ignore_flag=0) 61 | data_info['instances'] = [instance] 62 | data_list.append(data_info) 63 | 64 | return data_list 65 | else: 66 | img_files = glob.glob(osp.join(self.data_prefix['img_path'], '*')) 67 | if len(img_files) == 0: 68 | raise ValueError('There is no img file in ' 69 | f'{self.data_prefix["img_path"]}') 70 | for img_file in img_files: 71 | data_info = {} 72 | img_id = osp.split(img_file)[1][:-4] 73 | data_info['img_id'] = img_id 74 | data_info['file_name'] = osp.basename(img_file) 75 | data_info['img_path'] = img_file 76 | instances = [] 77 | txt_file = osp.join(self.ann_file, img_id + ".txt") 78 | with open(txt_file) as f: 79 | s = f.readlines() 80 | for si in s: 81 | instance = {} 82 | bbox_info = si.split() 83 | instance['bbox'] = [float(i) for i in bbox_info[:8]] 84 | cls_name = bbox_info[8] 85 | instance['bbox_label'] = cls_map[cls_name] 86 | difficulty = int(bbox_info[9]) 87 | if difficulty > self.diff_thr: 88 | instance['ignore_flag'] = 1 89 | else: 90 | instance['ignore_flag'] = 0 91 | instances.append(instance) 92 | data_info['instances'] = instances 93 | data_list.append(data_info) 94 | 95 | return data_list 96 | 97 | def filter_data(self) -> List[dict]: 98 | """Filter annotations according to filter_cfg. 99 | 100 | Returns: 101 | List[dict]: Filtered results. 102 | """ 103 | if self.test_mode: 104 | return self.data_list 105 | 106 | filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) \ 107 | if self.filter_cfg is not None else False 108 | 109 | valid_data_infos = [] 110 | for i, data_info in enumerate(self.data_list): 111 | if filter_empty_gt and len(data_info['instances']) == 0: 112 | continue 113 | valid_data_infos.append(data_info) 114 | 115 | return valid_data_infos 116 | 117 | def get_cat_ids(self, idx: int) -> List[int]: 118 | """Get RSAR category ids by index. 119 | 120 | Args: 121 | idx (int): Index of data. 122 | Returns: 123 | List[int]: All categories in the image of specified index. 
124 | """ 125 | 126 | instances = self.get_data_info(idx)['instances'] 127 | return [instance['bbox_label'] for instance in instances] 128 | -------------------------------------------------------------------------------- /lmmrotate/train.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Optional 3 | from dataclasses import dataclass, field 4 | 5 | import transformers 6 | 7 | from .dataset import OrientedDetSFTDataset 8 | from .trainer import CustomTrainer, DatasetResetCallback, initialize_model_and_processor 9 | 10 | 11 | @dataclass 12 | class ModelArguments: 13 | model_type: Optional[str] = field(default="florence2") 14 | model_name_or_path: Optional[str] = field(default="microsoft/florence-2-large") 15 | image_square_length: Optional[int] = field(default=None) 16 | language_model_max_length: Optional[int] = field(default=None) 17 | model_revision: Optional[str] = field(default=None) 18 | language_model_lora: Optional[int] = field(default=None) 19 | 20 | 21 | @dataclass 22 | class DataArguments: 23 | data_path: str = field(default=None, 24 | metadata={"help": "Path to the training data.", "nargs": "+"}) 25 | image_folder: Optional[str] = field(default=None, metadata={"nargs": "+"}) 26 | dataset_mode: str = field(default="single") # single / concat / balanced concat 27 | response_format: Optional[str] = field(default="florence2") 28 | 29 | 30 | @dataclass 31 | class TrainingArguments(transformers.TrainingArguments): 32 | cache_dir: Optional[str] = field(default=None) 33 | optim: str = field(default="adamw_torch") 34 | freeze_language: bool = field( 35 | default=False, 36 | metadata={'help': 'Set to True to freeze the language model.'}, 37 | ) 38 | freeze_vision: bool = field( 39 | default=False, 40 | metadata={'help': 'Set to True to freeze the vision backbone of the model.'}, 41 | ) 42 | freeze_multimodal_projection: bool = field( 43 | default=False, 44 | metadata={'help': 'Set to True to freeze the multimodal projection.'}, 45 | ) 46 | attn_implementation: str = field(default="flash_attention_2", metadata={"help": "Use transformers attention implementation."}) 47 | 48 | 49 | def parse_args(): 50 | parser = transformers.HfArgumentParser( 51 | (ModelArguments, DataArguments, TrainingArguments)) 52 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 53 | data_args.language_model_max_length = model_args.language_model_max_length 54 | data_args.model_type = model_args.model_type 55 | model_args.response_format = data_args.response_format 56 | return model_args, data_args, training_args 57 | 58 | 59 | def train(): 60 | # Initialize arguments 61 | model_args, data_args, training_args = parse_args() 62 | 63 | # Initialize model and processor 64 | model, processor = initialize_model_and_processor(model_args, training_args) 65 | 66 | # Initialize dataset 67 | train_dataset = OrientedDetSFTDataset(processor=processor, data_args=data_args, model=model) 68 | 69 | # Initialize trainer 70 | trainer = CustomTrainer( 71 | model=model, 72 | processor=processor, 73 | tokenizer=processor.tokenizer, 74 | args=training_args, 75 | train_dataset=train_dataset, 76 | eval_dataset=train_dataset, 77 | data_collator=train_dataset.collate_fn, 78 | callbacks=[DatasetResetCallback(train_dataset)], 79 | ) 80 | 81 | # Train 82 | if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): 83 | trainer.train(resume_from_checkpoint=True) 84 | else: 85 | trainer.train() 86 | 87 | # Save final model 88 | 
trainer.save_final_model() 89 | 90 | 91 | if __name__ == "__main__": 92 | train() 93 | -------------------------------------------------------------------------------- /playground/P0368__1024__0___0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionXLab/mllm-mmrotate/110f0ffdfd6788c0a73eaa3457cc4dfbdfc28bbb/playground/P0368__1024__0___0.png -------------------------------------------------------------------------------- /playground/eval_mmrotate_detector_mapnc.py: -------------------------------------------------------------------------------- 1 | import json 2 | from fire import Fire 3 | 4 | import torch 5 | 6 | from mmengine.fileio import load 7 | from mmengine.evaluator import Evaluator 8 | 9 | from mmrotate.utils import register_all_modules 10 | from mmdet.utils import register_all_modules as register_all_modules_mmdet 11 | 12 | 13 | def monkey_patch_of_collections_typehint_for_mmrotate1x(): 14 | import collections 15 | from collections.abc import Mapping, Sequence, Iterable 16 | collections.Mapping = Mapping 17 | collections.Sequence = Sequence 18 | collections.Iterable = Iterable 19 | 20 | monkey_patch_of_collections_typehint_for_mmrotate1x() 21 | 22 | register_all_modules_mmdet(init_default_scope=False) 23 | register_all_modules(init_default_scope=False) 24 | 25 | # G=20 26 | # THRESHOLD_TO_CHECK = [i / G for i in range(int(G))] 27 | THRESHOLD_TO_CHECK = (0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.7) 28 | 29 | 30 | def prepare_evaluator(dataset_name): 31 | if dataset_name == "dota": 32 | from mmrotate.evaluation import DOTAMetric 33 | evaluator = Evaluator(DOTAMetric(metric="mAP")) 34 | evaluator.dataset_meta = { 35 | 'classes': 36 | ('plane', 'baseball-diamond', 'bridge', 'ground-track-field', 37 | 'small-vehicle', 'large-vehicle', 'ship', 'tennis-court', 38 | 'basketball-court', 'storage-tank', 'soccer-ball-field', 'roundabout', 39 | 'harbor', 'swimming-pool', 'helicopter'), 40 | } 41 | elif dataset_name == "fair": 42 | from lmmrotate.modules.fair_metric import FAIRMetric 43 | evaluator = Evaluator(FAIRMetric(metric="mAP")) 44 | evaluator.dataset_meta = { 45 | 'classes': 46 | ('Boeing737', 'Boeing747', 'Boeing777', 'Boeing787', 'C919', 'A220', 47 | 'A321', 'A330', 'A350', 'ARJ21', 'Passenger Ship', 'Motorboat', 48 | 'Fishing Boat', 'Tugboat', 'Engineering Ship', 'Liquid Cargo Ship', 49 | 'Dry Cargo Ship', 'Warship', 'Small Car', 'Bus', 'Cargo Truck', 50 | 'Dump Truck', 'Van', 'Trailer', 'Tractor', 'Excavator', 51 | 'Truck Tractor', 'Basketball Court', 'Tennis Court', 'Football Field', 52 | 'Baseball Field', 'Intersection', 'Roundabout', 'Bridge'), 53 | } 54 | elif dataset_name == "dior": 55 | from mmrotate.evaluation import DOTAMetric 56 | evaluator = Evaluator(DOTAMetric(metric="mAP")) 57 | evaluator.dataset_meta = { 58 | 'classes': 59 | ('airplane', 'airport', 'baseballfield', 'basketballcourt', 'bridge', 60 | 'chimney', 'expressway-service-area', 'expressway-toll-station', 61 | 'dam', 'golffield', 'groundtrackfield', 'harbor', 'overpass', 'ship', 62 | 'stadium', 'storagetank', 'tenniscourt', 'trainstation', 'vehicle', 'windmill'), 63 | } 64 | elif dataset_name == "srsdd": 65 | from mmrotate.evaluation import RotatedCocoMetric 66 | evaluator = Evaluator(RotatedCocoMetric(metric='bbox', classwise=True)) 67 | evaluator.dataset_meta = { 68 | 'classes': 69 | ('Cell-Container', 'Container', 'Dredger', 'Fishing', 'LawEnforce', 'ore-oil'), 70 | } 71 | elif dataset_name == "rsar": 72 | from mmrotate.evaluation 
import DOTAMetric 73 | evaluator = Evaluator(DOTAMetric(metric="mAP")) 74 | evaluator.dataset_meta = {'classes': ('ship', 'aircraft', 'car', 'tank', 'bridge', 'harbor')} 75 | else: 76 | raise NotImplementedError(f"Unknown dataset: {dataset_name}") 77 | return evaluator 78 | 79 | 80 | def get_results_of_different_thresholds(dataset_name, pickle_result_path): 81 | results = {} 82 | for threshold in THRESHOLD_TO_CHECK: 83 | print("".join(["="*10, f" score threshold={threshold} ", "="*10])) 84 | 85 | evaluator = prepare_evaluator(dataset_name) 86 | results_test = load(pickle_result_path) 87 | for res in results_test: 88 | keep = res["pred_instances"]["scores"] > threshold 89 | res["pred_instances"]["labels"] = res["pred_instances"]["labels"][keep] 90 | res["pred_instances"]["bboxes"] = res["pred_instances"]["bboxes"][keep] 91 | res["pred_instances"]["scores"] = torch.ones_like(res["pred_instances"]["scores"][keep]) 92 | 93 | mAP = evaluator.offline_evaluate(data_samples=results_test, chunk_size=128) 94 | mAP = mAP.get('dota/mAP', mAP.get('fair1m/mAP', mAP.get('r_coco/bbox_mAP_50'))) 95 | 96 | results[threshold] = mAP 97 | 98 | print(json.dumps(results, indent=4)) 99 | print("best result: ", max(results.values())) 100 | print("best threshold: ", max(results, key=results.get)) 101 | 102 | 103 | if __name__ == "__main__": 104 | Fire(get_results_of_different_thresholds) 105 | -------------------------------------------------------------------------------- /playground/gradio_app_poly.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | from PIL import Image, ImageDraw 3 | 4 | 5 | def draw_quad(image, quad_coords): 6 | image = image.convert("RGB") 7 | draw = ImageDraw.Draw(image) 8 | 9 | coords = list(map(float, quad_coords.split(','))) 10 | all_polygons = [] 11 | for i in range(len(coords) // 8): 12 | all_polygons.append(coords[i * 8: (i + 1) * 8]) 13 | print(f"left {coords[(i + 1) * 8:]}") 14 | 15 | for polygon in all_polygons: 16 | draw.polygon(polygon, outline="red", width=3) 17 | return image 18 | 19 | 20 | def load_img(img_path_input): 21 | image = Image.open(img_path_input) 22 | return image 23 | 24 | 25 | if __name__ == "__main__": 26 | with gr.Blocks() as demo: 27 | gr.Markdown("# Quad Box Drawer") 28 | 29 | image_input = gr.Image(type="pil", label="Upload Image") 30 | img_path_input = gr.Textbox(label="Image Path", placeholder="absolute path") 31 | quad_input = gr.Textbox(label="Quad Coordinates (comma separated, 8 values)", placeholder="x1,y1,x2,y2,x3,y3,x4,y4") 32 | img_button = gr.Button("Load Image") 33 | draw_button = gr.Button("Draw Quad Box") 34 | image_output = gr.Image(type="pil", label="Image with Quad Box") 35 | img_button.click(load_img, inputs=[img_path_input], outputs=[image_input]) 36 | draw_button.click(draw_quad, inputs=[image_input, quad_input], outputs=[image_output]) 37 | 38 | demo.launch(share=False) 39 | -------------------------------------------------------------------------------- /playground/mmrotate_configs/dior-1024.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'DIORDataset' 3 | data_root = 'playground/data/DIOR/' 4 | 5 | train_pipeline = [ 6 | dict(type='mmdet.LoadImageFromFile'), 7 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 8 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 9 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 10 | dict( 11 |
type='mmdet.RandomFlip', 12 | prob=0.75, 13 | direction=['horizontal', 'vertical', 'diagonal']), 14 | dict(type='mmdet.PackDetInputs') 15 | ] 16 | val_pipeline = [ 17 | dict(type='mmdet.LoadImageFromFile'), 18 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 19 | # avoid bboxes being resized 20 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 21 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 22 | dict( 23 | type='mmdet.PackDetInputs', 24 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 25 | 'scale_factor')) 26 | ] 27 | test_pipeline = [ 28 | dict(type='mmdet.LoadImageFromFile'), 29 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 30 | dict( 31 | type='mmdet.PackDetInputs', 32 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 33 | 'scale_factor')) 34 | ] 35 | train_dataloader = dict( 36 | batch_size=2, 37 | num_workers=2, 38 | persistent_workers=True, 39 | sampler=dict(type='DefaultSampler', shuffle=True), 40 | batch_sampler=None, 41 | dataset=dict( 42 | type=dataset_type, 43 | data_root=data_root, 44 | ann_file='ImageSets/Main/trainval.txt', 45 | data_prefix=dict(img_path='JPEGImages-trainval'), 46 | filter_cfg=dict(filter_empty_gt=True), 47 | pipeline=train_pipeline)) 48 | val_dataloader = dict( 49 | batch_size=1, 50 | num_workers=2, 51 | persistent_workers=True, 52 | drop_last=False, 53 | sampler=dict(type='DefaultSampler', shuffle=False), 54 | dataset=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | ann_file='ImageSets/Main/test.txt', 58 | data_prefix=dict(img_path='JPEGImages-test'), 59 | test_mode=True, 60 | pipeline=val_pipeline)) 61 | test_dataloader = val_dataloader 62 | 63 | val_evaluator = dict(type='DOTAMetric', metric='mAP') 64 | test_evaluator = val_evaluator -------------------------------------------------------------------------------- /playground/mmrotate_configs/dior.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'DIORDataset' 3 | data_root = 'playground/data/DIOR/' 4 | 5 | train_pipeline = [ 6 | dict(type='mmdet.LoadImageFromFile'), 7 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 8 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 9 | dict(type='mmdet.Resize', scale=(800, 800), keep_ratio=True), 10 | dict( 11 | type='mmdet.RandomFlip', 12 | prob=0.75, 13 | direction=['horizontal', 'vertical', 'diagonal']), 14 | dict(type='mmdet.PackDetInputs') 15 | ] 16 | val_pipeline = [ 17 | dict(type='mmdet.LoadImageFromFile'), 18 | dict(type='mmdet.Resize', scale=(800, 800), keep_ratio=True), 19 | # avoid bboxes being resized 20 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 21 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 22 | dict( 23 | type='mmdet.PackDetInputs', 24 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 25 | 'scale_factor')) 26 | ] 27 | test_pipeline = [ 28 | dict(type='mmdet.LoadImageFromFile'), 29 | dict(type='mmdet.Resize', scale=(800, 800), keep_ratio=True), 30 | dict( 31 | type='mmdet.PackDetInputs', 32 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 33 | 'scale_factor')) 34 | ] 35 | train_dataloader = dict( 36 | batch_size=2, 37 | num_workers=2, 38 | persistent_workers=True, 39 | sampler=dict(type='DefaultSampler', shuffle=True), 40 | batch_sampler=None, 41 | dataset=dict( 42 | type=dataset_type, 43 | data_root=data_root, 44 | 
ann_file='ImageSets/Main/trainval.txt', 45 | data_prefix=dict(img_path='JPEGImages-trainval'), 46 | filter_cfg=dict(filter_empty_gt=True), 47 | pipeline=train_pipeline)) 48 | val_dataloader = dict( 49 | batch_size=1, 50 | num_workers=2, 51 | persistent_workers=True, 52 | drop_last=False, 53 | sampler=dict(type='DefaultSampler', shuffle=False), 54 | dataset=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | ann_file='ImageSets/Main/test.txt', 58 | data_prefix=dict(img_path='JPEGImages-test'), 59 | test_mode=True, 60 | pipeline=val_pipeline)) 61 | test_dataloader = val_dataloader 62 | 63 | val_evaluator = dict(type='DOTAMetric', metric='mAP') 64 | test_evaluator = val_evaluator -------------------------------------------------------------------------------- /playground/mmrotate_configs/fair1m.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | custom_imports = dict(imports=['lmmrotate.modules.fair_dataset', 'lmmrotate.modules.fair_metric'], allow_failed_imports=False) 3 | 4 | dataset_type = 'FAIRDOTADataset' 5 | data_root = 'playground/data/split_ss_fair1m_1_0/' 6 | backend_args = None 7 | 8 | train_pipeline = [ 9 | dict(type='mmdet.LoadImageFromFile', backend_args=backend_args), 10 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 11 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 12 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 13 | dict( 14 | type='mmdet.RandomFlip', 15 | prob=0.75, 16 | direction=['horizontal', 'vertical', 'diagonal']), 17 | dict(type='mmdet.PackDetInputs') 18 | ] 19 | val_pipeline = [ 20 | dict(type='mmdet.LoadImageFromFile', backend_args=backend_args), 21 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 22 | # avoid bboxes being resized 23 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 24 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 25 | dict( 26 | type='mmdet.PackDetInputs', 27 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 28 | 'scale_factor')) 29 | ] 30 | test_pipeline = [ 31 | dict(type='mmdet.LoadImageFromFile', backend_args=backend_args), 32 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 33 | dict( 34 | type='mmdet.PackDetInputs', 35 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 36 | 'scale_factor')) 37 | ] 38 | train_dataloader = dict( 39 | batch_size=2, 40 | num_workers=2, 41 | persistent_workers=True, 42 | sampler=dict(type='DefaultSampler', shuffle=True), 43 | batch_sampler=None, 44 | dataset=dict( 45 | type=dataset_type, 46 | data_root=data_root, 47 | ann_file='train/annfiles/', 48 | data_prefix=dict(img_path='train/images/'), 49 | filter_cfg=dict(filter_empty_gt=True), 50 | pipeline=train_pipeline)) 51 | val_dataloader = dict( 52 | batch_size=4, 53 | num_workers=2, 54 | persistent_workers=True, 55 | drop_last=False, 56 | sampler=dict(type='DefaultSampler', shuffle=False), 57 | dataset=dict( 58 | type=dataset_type, 59 | data_root=data_root, 60 | ann_file='train/annfiles/', 61 | data_prefix=dict(img_path='train/images/'), 62 | filter_cfg=dict(filter_empty_gt=True), 63 | test_mode=True, 64 | pipeline=val_pipeline)) 65 | test_dataloader = val_dataloader 66 | 67 | val_evaluator = dict(type='FAIRMetric', metric='mAP') 68 | test_evaluator = val_evaluator 69 | 70 | # inference on test dataset and format the output results 71 | # for submission. Note: the test set has no annotation. 
72 | test_dataloader = dict( 73 | batch_size=4, 74 | num_workers=2, 75 | persistent_workers=True, 76 | drop_last=False, 77 | sampler=dict(type='DefaultSampler', shuffle=False), 78 | dataset=dict( 79 | type=dataset_type, 80 | data_root=data_root, 81 | data_prefix=dict(img_path='test/images/'), 82 | test_mode=True, 83 | pipeline=test_pipeline)) 84 | test_evaluator = dict( 85 | type='FAIRMetric', 86 | format_only=True, 87 | merge_patches=True, 88 | outfile_prefix='./work_dirs/fair/Task1') -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-fcos-le90_r50_fpn_1x_dior-1024.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | './dior-1024.py', 3 | 'mmrotate::_base_/schedules/schedule_1x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | # model settings 9 | model = dict( 10 | type='mmdet.FCOS', 11 | data_preprocessor=dict( 12 | type='mmdet.DetDataPreprocessor', 13 | mean=[123.675, 116.28, 103.53], 14 | std=[58.395, 57.12, 57.375], 15 | bgr_to_rgb=True, 16 | pad_size_divisor=32, 17 | boxtype2tensor=False), 18 | backbone=dict( 19 | type='mmdet.ResNet', 20 | depth=50, 21 | num_stages=4, 22 | out_indices=(0, 1, 2, 3), 23 | frozen_stages=1, 24 | norm_cfg=dict(type='BN', requires_grad=True), 25 | norm_eval=True, 26 | style='pytorch', 27 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 28 | neck=dict( 29 | type='mmdet.FPN', 30 | in_channels=[256, 512, 1024, 2048], 31 | out_channels=256, 32 | start_level=1, 33 | add_extra_convs='on_output', 34 | num_outs=5, 35 | relu_before_extra_convs=True), 36 | bbox_head=dict( 37 | type='RotatedFCOSHead', 38 | num_classes=20, 39 | in_channels=256, 40 | stacked_convs=4, 41 | feat_channels=256, 42 | strides=[8, 16, 32, 64, 128], 43 | center_sampling=True, 44 | center_sample_radius=1.5, 45 | norm_on_bbox=True, 46 | centerness_on_reg=True, 47 | use_hbbox_loss=False, 48 | scale_angle=True, 49 | bbox_coder=dict( 50 | type='DistanceAnglePointCoder', angle_version=angle_version), 51 | loss_cls=dict( 52 | type='mmdet.FocalLoss', 53 | use_sigmoid=True, 54 | gamma=2.0, 55 | alpha=0.25, 56 | loss_weight=1.0), 57 | loss_bbox=dict(type='RotatedIoULoss', loss_weight=1.0), 58 | loss_angle=None, 59 | loss_centerness=dict( 60 | type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), 61 | # training and testing settings 62 | train_cfg=None, 63 | test_cfg=dict( 64 | nms_pre=2000, 65 | min_bbox_size=0, 66 | score_thr=0.05, 67 | nms=dict(type='nms_rotated', iou_threshold=0.1), 68 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-fcos-le90_r50_fpn_1x_dior.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | './dior.py', 3 | 'mmrotate::_base_/schedules/schedule_1x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | # model settings 9 | model = dict( 10 | type='mmdet.FCOS', 11 | data_preprocessor=dict( 12 | type='mmdet.DetDataPreprocessor', 13 | mean=[123.675, 116.28, 103.53], 14 | std=[58.395, 57.12, 57.375], 15 | bgr_to_rgb=True, 16 | pad_size_divisor=32, 17 | boxtype2tensor=False), 18 | backbone=dict( 19 | type='mmdet.ResNet', 20 | depth=50, 21 | num_stages=4, 22 | out_indices=(0, 1, 2, 3), 23 | frozen_stages=1, 24 | norm_cfg=dict(type='BN', requires_grad=True), 25 | norm_eval=True, 26 | style='pytorch', 27 | 
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 28 | neck=dict( 29 | type='mmdet.FPN', 30 | in_channels=[256, 512, 1024, 2048], 31 | out_channels=256, 32 | start_level=1, 33 | add_extra_convs='on_output', 34 | num_outs=5, 35 | relu_before_extra_convs=True), 36 | bbox_head=dict( 37 | type='RotatedFCOSHead', 38 | num_classes=20, 39 | in_channels=256, 40 | stacked_convs=4, 41 | feat_channels=256, 42 | strides=[8, 16, 32, 64, 128], 43 | center_sampling=True, 44 | center_sample_radius=1.5, 45 | norm_on_bbox=True, 46 | centerness_on_reg=True, 47 | use_hbbox_loss=False, 48 | scale_angle=True, 49 | bbox_coder=dict( 50 | type='DistanceAnglePointCoder', angle_version=angle_version), 51 | loss_cls=dict( 52 | type='mmdet.FocalLoss', 53 | use_sigmoid=True, 54 | gamma=2.0, 55 | alpha=0.25, 56 | loss_weight=1.0), 57 | loss_bbox=dict(type='RotatedIoULoss', loss_weight=1.0), 58 | loss_angle=None, 59 | loss_centerness=dict( 60 | type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), 61 | # training and testing settings 62 | train_cfg=None, 63 | test_cfg=dict( 64 | nms_pre=2000, 65 | min_bbox_size=0, 66 | score_thr=0.05, 67 | nms=dict(type='nms_rotated', iou_threshold=0.1), 68 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-fcos-le90_r50_fpn_1x_dota-train.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmrotate::_base_/datasets/dota.py', 3 | 'mmrotate::_base_/schedules/schedule_1x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | data_root = 'playground/data/split_ss_dota/' 9 | train_dataloader = dict( 10 | dataset=dict( 11 | data_root=data_root, 12 | ann_file='train/annfiles/', 13 | ) 14 | ) 15 | val_dataloader = dict( 16 | dataset=dict( 17 | data_root=data_root, 18 | ann_file='val/annfiles/', 19 | ) 20 | ) 21 | test_dataloader = val_dataloader 22 | 23 | # model settings 24 | model = dict( 25 | type='mmdet.FCOS', 26 | data_preprocessor=dict( 27 | type='mmdet.DetDataPreprocessor', 28 | mean=[123.675, 116.28, 103.53], 29 | std=[58.395, 57.12, 57.375], 30 | bgr_to_rgb=True, 31 | pad_size_divisor=32, 32 | boxtype2tensor=False), 33 | backbone=dict( 34 | type='mmdet.ResNet', 35 | depth=50, 36 | num_stages=4, 37 | out_indices=(0, 1, 2, 3), 38 | frozen_stages=1, 39 | norm_cfg=dict(type='BN', requires_grad=True), 40 | norm_eval=True, 41 | style='pytorch', 42 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 43 | neck=dict( 44 | type='mmdet.FPN', 45 | in_channels=[256, 512, 1024, 2048], 46 | out_channels=256, 47 | start_level=1, 48 | add_extra_convs='on_output', 49 | num_outs=5, 50 | relu_before_extra_convs=True), 51 | bbox_head=dict( 52 | type='RotatedFCOSHead', 53 | num_classes=15, 54 | in_channels=256, 55 | stacked_convs=4, 56 | feat_channels=256, 57 | strides=[8, 16, 32, 64, 128], 58 | center_sampling=True, 59 | center_sample_radius=1.5, 60 | norm_on_bbox=True, 61 | centerness_on_reg=True, 62 | use_hbbox_loss=False, 63 | scale_angle=True, 64 | bbox_coder=dict( 65 | type='DistanceAnglePointCoder', angle_version=angle_version), 66 | loss_cls=dict( 67 | type='mmdet.FocalLoss', 68 | use_sigmoid=True, 69 | gamma=2.0, 70 | alpha=0.25, 71 | loss_weight=1.0), 72 | loss_bbox=dict(type='RotatedIoULoss', loss_weight=1.0), 73 | loss_angle=None, 74 | loss_centerness=dict( 75 | type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), 76 | # training and testing settings 77 
| train_cfg=None, 78 | test_cfg=dict( 79 | nms_pre=2000, 80 | min_bbox_size=0, 81 | score_thr=0.05, 82 | nms=dict(type='nms_rotated', iou_threshold=0.1), 83 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-fcos-le90_r50_fpn_1x_fair.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | './fair1m.py', 3 | 'mmrotate::_base_/schedules/schedule_1x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | # model settings 9 | model = dict( 10 | type='mmdet.FCOS', 11 | data_preprocessor=dict( 12 | type='mmdet.DetDataPreprocessor', 13 | mean=[123.675, 116.28, 103.53], 14 | std=[58.395, 57.12, 57.375], 15 | bgr_to_rgb=True, 16 | pad_size_divisor=32, 17 | boxtype2tensor=False), 18 | backbone=dict( 19 | type='mmdet.ResNet', 20 | depth=50, 21 | num_stages=4, 22 | out_indices=(0, 1, 2, 3), 23 | frozen_stages=1, 24 | norm_cfg=dict(type='BN', requires_grad=True), 25 | norm_eval=True, 26 | style='pytorch', 27 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 28 | neck=dict( 29 | type='mmdet.FPN', 30 | in_channels=[256, 512, 1024, 2048], 31 | out_channels=256, 32 | start_level=1, 33 | add_extra_convs='on_output', 34 | num_outs=5, 35 | relu_before_extra_convs=True), 36 | bbox_head=dict( 37 | type='RotatedFCOSHead', 38 | num_classes=34, 39 | in_channels=256, 40 | stacked_convs=4, 41 | feat_channels=256, 42 | strides=[8, 16, 32, 64, 128], 43 | center_sampling=True, 44 | center_sample_radius=1.5, 45 | norm_on_bbox=True, 46 | centerness_on_reg=True, 47 | use_hbbox_loss=False, 48 | scale_angle=True, 49 | bbox_coder=dict( 50 | type='DistanceAnglePointCoder', angle_version=angle_version), 51 | loss_cls=dict( 52 | type='mmdet.FocalLoss', 53 | use_sigmoid=True, 54 | gamma=2.0, 55 | alpha=0.25, 56 | loss_weight=1.0), 57 | loss_bbox=dict(type='RotatedIoULoss', loss_weight=1.0), 58 | loss_angle=None, 59 | loss_centerness=dict( 60 | type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), 61 | # training and testing settings 62 | train_cfg=None, 63 | test_cfg=dict( 64 | nms_pre=2000, 65 | min_bbox_size=0, 66 | score_thr=0.05, 67 | nms=dict(type='nms_rotated', iou_threshold=0.1), 68 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-fcos-le90_r50_fpn_1x_rsar-1024.py: -------------------------------------------------------------------------------- 1 | # https://github.com/zhasion/RSAR/blob/main/configs/rotated_fcos/rotated-fcos-le90_r50_fpn_1x_rsar.py 2 | _base_ = [ 3 | './rsar-1024.py', 4 | 'mmrotate::_base_/schedules/schedule_1x.py', 5 | 'mmrotate::_base_/default_runtime.py' 6 | ] 7 | angle_version = 'le90' 8 | 9 | # model settings 10 | model = dict( 11 | type='mmdet.FCOS', 12 | data_preprocessor=dict( 13 | type='mmdet.DetDataPreprocessor', 14 | mean=[123.675, 116.28, 103.53], 15 | std=[58.395, 57.12, 57.375], 16 | bgr_to_rgb=True, 17 | pad_size_divisor=32, 18 | boxtype2tensor=False), 19 | backbone=dict( 20 | type='mmdet.ResNet', 21 | depth=50, 22 | num_stages=4, 23 | out_indices=(0, 1, 2, 3), 24 | frozen_stages=1, 25 | norm_cfg=dict(type='BN', requires_grad=True), 26 | norm_eval=True, 27 | style='pytorch', 28 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 29 | neck=dict( 30 | type='mmdet.FPN', 31 | in_channels=[256, 512, 1024, 2048], 32 | out_channels=256, 33 | start_level=1, 34 | 
add_extra_convs='on_output', 35 | num_outs=5, 36 | relu_before_extra_convs=True), 37 | bbox_head=dict( 38 | type='RotatedFCOSHead', 39 | num_classes=6, 40 | in_channels=256, 41 | stacked_convs=4, 42 | feat_channels=256, 43 | strides=[8, 16, 32, 64, 128], 44 | center_sampling=True, 45 | center_sample_radius=1.5, 46 | norm_on_bbox=True, 47 | centerness_on_reg=True, 48 | use_hbbox_loss=False, 49 | scale_angle=True, 50 | bbox_coder=dict( 51 | type='DistanceAnglePointCoder', angle_version=angle_version), 52 | loss_cls=dict( 53 | type='mmdet.FocalLoss', 54 | use_sigmoid=True, 55 | gamma=2.0, 56 | alpha=0.25, 57 | loss_weight=1.0), 58 | loss_bbox=dict(type='RotatedIoULoss', loss_weight=1.0), 59 | loss_angle=None, 60 | loss_centerness=dict( 61 | type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), 62 | # training and testing settings 63 | train_cfg=None, 64 | test_cfg=dict( 65 | nms_pre=2000, 66 | min_bbox_size=0, 67 | score_thr=0.05, 68 | nms=dict(type='nms_rotated', iou_threshold=0.1), 69 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-fcos-le90_r50_fpn_1x_rsar.py: -------------------------------------------------------------------------------- 1 | # https://github.com/zhasion/RSAR/blob/main/configs/rotated_fcos/rotated-fcos-le90_r50_fpn_1x_rsar.py 2 | _base_ = [ 3 | './rsar.py', 4 | 'mmrotate::_base_/schedules/schedule_1x.py', 5 | 'mmrotate::_base_/default_runtime.py' 6 | ] 7 | angle_version = 'le90' 8 | 9 | # model settings 10 | model = dict( 11 | type='mmdet.FCOS', 12 | data_preprocessor=dict( 13 | type='mmdet.DetDataPreprocessor', 14 | mean=[123.675, 116.28, 103.53], 15 | std=[58.395, 57.12, 57.375], 16 | bgr_to_rgb=True, 17 | pad_size_divisor=32, 18 | boxtype2tensor=False), 19 | backbone=dict( 20 | type='mmdet.ResNet', 21 | depth=50, 22 | num_stages=4, 23 | out_indices=(0, 1, 2, 3), 24 | frozen_stages=1, 25 | norm_cfg=dict(type='BN', requires_grad=True), 26 | norm_eval=True, 27 | style='pytorch', 28 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 29 | neck=dict( 30 | type='mmdet.FPN', 31 | in_channels=[256, 512, 1024, 2048], 32 | out_channels=256, 33 | start_level=1, 34 | add_extra_convs='on_output', 35 | num_outs=5, 36 | relu_before_extra_convs=True), 37 | bbox_head=dict( 38 | type='RotatedFCOSHead', 39 | num_classes=6, 40 | in_channels=256, 41 | stacked_convs=4, 42 | feat_channels=256, 43 | strides=[8, 16, 32, 64, 128], 44 | center_sampling=True, 45 | center_sample_radius=1.5, 46 | norm_on_bbox=True, 47 | centerness_on_reg=True, 48 | use_hbbox_loss=False, 49 | scale_angle=True, 50 | bbox_coder=dict( 51 | type='DistanceAnglePointCoder', angle_version=angle_version), 52 | loss_cls=dict( 53 | type='mmdet.FocalLoss', 54 | use_sigmoid=True, 55 | gamma=2.0, 56 | alpha=0.25, 57 | loss_weight=1.0), 58 | loss_bbox=dict(type='RotatedIoULoss', loss_weight=1.0), 59 | loss_angle=None, 60 | loss_centerness=dict( 61 | type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), 62 | # training and testing settings 63 | train_cfg=None, 64 | test_cfg=dict( 65 | nms_pre=2000, 66 | min_bbox_size=0, 67 | score_thr=0.05, 68 | nms=dict(type='nms_rotated', iou_threshold=0.1), 69 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-fcos-le90_r50_fpn_6x_srsdd.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | './srsdd.py', 3 | 
'mmrotate::_base_/schedules/schedule_6x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | # model settings 9 | model = dict( 10 | type='mmdet.FCOS', 11 | data_preprocessor=dict( 12 | type='mmdet.DetDataPreprocessor', 13 | mean=[123.675, 116.28, 103.53], 14 | std=[58.395, 57.12, 57.375], 15 | bgr_to_rgb=True, 16 | pad_size_divisor=32, 17 | boxtype2tensor=False), 18 | backbone=dict( 19 | type='mmdet.ResNet', 20 | depth=50, 21 | num_stages=4, 22 | out_indices=(0, 1, 2, 3), 23 | frozen_stages=1, 24 | norm_cfg=dict(type='BN', requires_grad=True), 25 | norm_eval=True, 26 | style='pytorch', 27 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 28 | neck=dict( 29 | type='mmdet.FPN', 30 | in_channels=[256, 512, 1024, 2048], 31 | out_channels=256, 32 | start_level=1, 33 | add_extra_convs='on_output', 34 | num_outs=5, 35 | relu_before_extra_convs=True), 36 | bbox_head=dict( 37 | type='RotatedFCOSHead', 38 | num_classes=6, 39 | in_channels=256, 40 | stacked_convs=4, 41 | feat_channels=256, 42 | strides=[8, 16, 32, 64, 128], 43 | center_sampling=True, 44 | center_sample_radius=1.5, 45 | norm_on_bbox=True, 46 | centerness_on_reg=True, 47 | use_hbbox_loss=False, 48 | scale_angle=True, 49 | bbox_coder=dict( 50 | type='DistanceAnglePointCoder', angle_version=angle_version), 51 | loss_cls=dict( 52 | type='mmdet.FocalLoss', 53 | use_sigmoid=True, 54 | gamma=2.0, 55 | alpha=0.25, 56 | loss_weight=1.0), 57 | loss_bbox=dict(type='RotatedIoULoss', loss_weight=1.0), 58 | loss_angle=None, 59 | loss_centerness=dict( 60 | type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), 61 | # training and testing settings 62 | train_cfg=None, 63 | test_cfg=dict( 64 | nms_pre=2000, 65 | min_bbox_size=0, 66 | score_thr=0.05, 67 | nms=dict(type='nms_rotated', iou_threshold=0.1), 68 | max_per_img=2000)) 69 | -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-retinanet-rbox-le90_r50_fpn_1x_dior-1024.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | './dior-1024.py', 3 | 'mmrotate::_base_/schedules/schedule_1x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | model = dict( 9 | type='mmdet.RetinaNet', 10 | data_preprocessor=dict( 11 | type='mmdet.DetDataPreprocessor', 12 | mean=[123.675, 116.28, 103.53], 13 | std=[58.395, 57.12, 57.375], 14 | bgr_to_rgb=True, 15 | pad_size_divisor=32, 16 | boxtype2tensor=False), 17 | backbone=dict( 18 | type='mmdet.ResNet', 19 | depth=50, 20 | num_stages=4, 21 | out_indices=(0, 1, 2, 3), 22 | frozen_stages=1, 23 | norm_cfg=dict(type='BN', requires_grad=True), 24 | norm_eval=True, 25 | style='pytorch', 26 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 27 | neck=dict( 28 | type='mmdet.FPN', 29 | in_channels=[256, 512, 1024, 2048], 30 | out_channels=256, 31 | start_level=1, 32 | add_extra_convs='on_input', 33 | num_outs=5), 34 | bbox_head=dict( 35 | type='mmdet.RetinaHead', 36 | num_classes=20, 37 | in_channels=256, 38 | stacked_convs=4, 39 | feat_channels=256, 40 | anchor_generator=dict( 41 | type='FakeRotatedAnchorGenerator', 42 | angle_version=angle_version, 43 | octave_base_scale=4, 44 | scales_per_octave=3, 45 | ratios=[1.0, 0.5, 2.0], 46 | strides=[8, 16, 32, 64, 128]), 47 | bbox_coder=dict( 48 | type='DeltaXYWHTRBBoxCoder', 49 | angle_version=angle_version, 50 | norm_factor=None, 51 | edge_swap=True, 52 | proj_xy=True, 53 | 
target_means=(.0, .0, .0, .0, .0), 54 | target_stds=(1.0, 1.0, 1.0, 1.0, 1.0)), 55 | loss_cls=dict( 56 | type='mmdet.FocalLoss', 57 | use_sigmoid=True, 58 | gamma=2.0, 59 | alpha=0.25, 60 | loss_weight=1.0), 61 | loss_bbox=dict(type='mmdet.L1Loss', loss_weight=1.0)), 62 | train_cfg=dict( 63 | assigner=dict( 64 | type='mmdet.MaxIoUAssigner', 65 | pos_iou_thr=0.5, 66 | neg_iou_thr=0.4, 67 | min_pos_iou=0, 68 | ignore_iof_thr=-1, 69 | iou_calculator=dict(type='RBboxOverlaps2D')), 70 | sampler=dict( 71 | type='mmdet.PseudoSampler'), # Focal loss should use PseudoSampler 72 | allowed_border=-1, 73 | pos_weight=-1, 74 | debug=False), 75 | test_cfg=dict( 76 | nms_pre=2000, 77 | min_bbox_size=0, 78 | score_thr=0.05, 79 | nms=dict(type='nms_rotated', iou_threshold=0.1), 80 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-retinanet-rbox-le90_r50_fpn_1x_dota-train.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmrotate::_base_/datasets/dota.py', 3 | 'mmrotate::_base_/schedules/schedule_1x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | data_root = 'playground/data/split_ss_dota/' 9 | train_dataloader = dict( 10 | dataset=dict( 11 | data_root=data_root, 12 | ann_file='train/annfiles/', 13 | ) 14 | ) 15 | val_dataloader = dict( 16 | dataset=dict( 17 | data_root=data_root, 18 | ann_file='val/annfiles/', 19 | ) 20 | ) 21 | test_dataloader = val_dataloader 22 | 23 | model = dict( 24 | type='mmdet.RetinaNet', 25 | data_preprocessor=dict( 26 | type='mmdet.DetDataPreprocessor', 27 | mean=[123.675, 116.28, 103.53], 28 | std=[58.395, 57.12, 57.375], 29 | bgr_to_rgb=True, 30 | pad_size_divisor=32, 31 | boxtype2tensor=False), 32 | backbone=dict( 33 | type='mmdet.ResNet', 34 | depth=50, 35 | num_stages=4, 36 | out_indices=(0, 1, 2, 3), 37 | frozen_stages=1, 38 | norm_cfg=dict(type='BN', requires_grad=True), 39 | norm_eval=True, 40 | style='pytorch', 41 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 42 | neck=dict( 43 | type='mmdet.FPN', 44 | in_channels=[256, 512, 1024, 2048], 45 | out_channels=256, 46 | start_level=1, 47 | add_extra_convs='on_input', 48 | num_outs=5), 49 | bbox_head=dict( 50 | type='mmdet.RetinaHead', 51 | num_classes=15, 52 | in_channels=256, 53 | stacked_convs=4, 54 | feat_channels=256, 55 | anchor_generator=dict( 56 | type='FakeRotatedAnchorGenerator', 57 | angle_version=angle_version, 58 | octave_base_scale=4, 59 | scales_per_octave=3, 60 | ratios=[1.0, 0.5, 2.0], 61 | strides=[8, 16, 32, 64, 128]), 62 | bbox_coder=dict( 63 | type='DeltaXYWHTRBBoxCoder', 64 | angle_version=angle_version, 65 | norm_factor=None, 66 | edge_swap=True, 67 | proj_xy=True, 68 | target_means=(.0, .0, .0, .0, .0), 69 | target_stds=(1.0, 1.0, 1.0, 1.0, 1.0)), 70 | loss_cls=dict( 71 | type='mmdet.FocalLoss', 72 | use_sigmoid=True, 73 | gamma=2.0, 74 | alpha=0.25, 75 | loss_weight=1.0), 76 | loss_bbox=dict(type='mmdet.L1Loss', loss_weight=1.0)), 77 | train_cfg=dict( 78 | assigner=dict( 79 | type='mmdet.MaxIoUAssigner', 80 | pos_iou_thr=0.5, 81 | neg_iou_thr=0.4, 82 | min_pos_iou=0, 83 | ignore_iof_thr=-1, 84 | iou_calculator=dict(type='RBboxOverlaps2D')), 85 | sampler=dict( 86 | type='mmdet.PseudoSampler'), # Focal loss should use PseudoSampler 87 | allowed_border=-1, 88 | pos_weight=-1, 89 | debug=False), 90 | test_cfg=dict( 91 | nms_pre=2000, 92 | min_bbox_size=0, 93 | score_thr=0.05, 94 | 
nms=dict(type='nms_rotated', iou_threshold=0.1), 95 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-retinanet-rbox-le90_r50_fpn_1x_dota.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 'mmrotate::_base_/datasets/dota.py', 3 | 'mmrotate::_base_/schedules/schedule_1x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | data_root = 'playground/data/split_ss_dota/' 9 | train_dataloader = dict(dataset=dict(data_root=data_root)) 10 | val_dataloader = dict(dataset=dict(data_root=data_root)) 11 | test_dataloader = dict(dataset=dict(data_root=data_root)) 12 | 13 | model = dict( 14 | type='mmdet.RetinaNet', 15 | data_preprocessor=dict( 16 | type='mmdet.DetDataPreprocessor', 17 | mean=[123.675, 116.28, 103.53], 18 | std=[58.395, 57.12, 57.375], 19 | bgr_to_rgb=True, 20 | pad_size_divisor=32, 21 | boxtype2tensor=False), 22 | backbone=dict( 23 | type='mmdet.ResNet', 24 | depth=50, 25 | num_stages=4, 26 | out_indices=(0, 1, 2, 3), 27 | frozen_stages=1, 28 | norm_cfg=dict(type='BN', requires_grad=True), 29 | norm_eval=True, 30 | style='pytorch', 31 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 32 | neck=dict( 33 | type='mmdet.FPN', 34 | in_channels=[256, 512, 1024, 2048], 35 | out_channels=256, 36 | start_level=1, 37 | add_extra_convs='on_input', 38 | num_outs=5), 39 | bbox_head=dict( 40 | type='mmdet.RetinaHead', 41 | num_classes=15, 42 | in_channels=256, 43 | stacked_convs=4, 44 | feat_channels=256, 45 | anchor_generator=dict( 46 | type='FakeRotatedAnchorGenerator', 47 | angle_version=angle_version, 48 | octave_base_scale=4, 49 | scales_per_octave=3, 50 | ratios=[1.0, 0.5, 2.0], 51 | strides=[8, 16, 32, 64, 128]), 52 | bbox_coder=dict( 53 | type='DeltaXYWHTRBBoxCoder', 54 | angle_version=angle_version, 55 | norm_factor=None, 56 | edge_swap=True, 57 | proj_xy=True, 58 | target_means=(.0, .0, .0, .0, .0), 59 | target_stds=(1.0, 1.0, 1.0, 1.0, 1.0)), 60 | loss_cls=dict( 61 | type='mmdet.FocalLoss', 62 | use_sigmoid=True, 63 | gamma=2.0, 64 | alpha=0.25, 65 | loss_weight=1.0), 66 | loss_bbox=dict(type='mmdet.L1Loss', loss_weight=1.0)), 67 | train_cfg=dict( 68 | assigner=dict( 69 | type='mmdet.MaxIoUAssigner', 70 | pos_iou_thr=0.5, 71 | neg_iou_thr=0.4, 72 | min_pos_iou=0, 73 | ignore_iof_thr=-1, 74 | iou_calculator=dict(type='RBboxOverlaps2D')), 75 | sampler=dict( 76 | type='mmdet.PseudoSampler'), # Focal loss should use PseudoSampler 77 | allowed_border=-1, 78 | pos_weight=-1, 79 | debug=False), 80 | test_cfg=dict( 81 | nms_pre=2000, 82 | min_bbox_size=0, 83 | score_thr=0.05, 84 | nms=dict(type='nms_rotated', iou_threshold=0.1), 85 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-retinanet-rbox-le90_r50_fpn_1x_fair.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | './fair1m.py', 3 | 'mmrotate::_base_/schedules/schedule_1x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | model = dict( 9 | type='mmdet.RetinaNet', 10 | data_preprocessor=dict( 11 | type='mmdet.DetDataPreprocessor', 12 | mean=[123.675, 116.28, 103.53], 13 | std=[58.395, 57.12, 57.375], 14 | bgr_to_rgb=True, 15 | pad_size_divisor=32, 16 | boxtype2tensor=False), 17 | backbone=dict( 18 | type='mmdet.ResNet', 19 | depth=50, 20 | num_stages=4, 21 | out_indices=(0, 
1, 2, 3), 22 | frozen_stages=1, 23 | norm_cfg=dict(type='BN', requires_grad=True), 24 | norm_eval=True, 25 | style='pytorch', 26 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 27 | neck=dict( 28 | type='mmdet.FPN', 29 | in_channels=[256, 512, 1024, 2048], 30 | out_channels=256, 31 | start_level=1, 32 | add_extra_convs='on_input', 33 | num_outs=5), 34 | bbox_head=dict( 35 | type='mmdet.RetinaHead', 36 | num_classes=34, 37 | in_channels=256, 38 | stacked_convs=4, 39 | feat_channels=256, 40 | anchor_generator=dict( 41 | type='FakeRotatedAnchorGenerator', 42 | angle_version=angle_version, 43 | octave_base_scale=4, 44 | scales_per_octave=3, 45 | ratios=[1.0, 0.5, 2.0], 46 | strides=[8, 16, 32, 64, 128]), 47 | bbox_coder=dict( 48 | type='DeltaXYWHTRBBoxCoder', 49 | angle_version=angle_version, 50 | norm_factor=None, 51 | edge_swap=True, 52 | proj_xy=True, 53 | target_means=(.0, .0, .0, .0, .0), 54 | target_stds=(1.0, 1.0, 1.0, 1.0, 1.0)), 55 | loss_cls=dict( 56 | type='mmdet.FocalLoss', 57 | use_sigmoid=True, 58 | gamma=2.0, 59 | alpha=0.25, 60 | loss_weight=1.0), 61 | loss_bbox=dict(type='mmdet.L1Loss', loss_weight=1.0)), 62 | train_cfg=dict( 63 | assigner=dict( 64 | type='mmdet.MaxIoUAssigner', 65 | pos_iou_thr=0.5, 66 | neg_iou_thr=0.4, 67 | min_pos_iou=0, 68 | ignore_iof_thr=-1, 69 | iou_calculator=dict(type='RBboxOverlaps2D')), 70 | sampler=dict( 71 | type='mmdet.PseudoSampler'), # Focal loss should use PseudoSampler 72 | allowed_border=-1, 73 | pos_weight=-1, 74 | debug=False), 75 | test_cfg=dict( 76 | nms_pre=2000, 77 | min_bbox_size=0, 78 | score_thr=0.05, 79 | nms=dict(type='nms_rotated', iou_threshold=0.1), 80 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-retinanet-rbox-le90_r50_fpn_1x_rsar-1024.py: -------------------------------------------------------------------------------- 1 | # https://github.com/zhasion/RSAR/blob/main/configs/rotated_retinanet/rotated-retinanet-rbox-le90_r50_fpn_1x_rsar.py 2 | _base_ = [ 3 | './rsar-1024.py', 4 | 'mmrotate::_base_/schedules/schedule_1x.py', 5 | 'mmrotate::_base_/default_runtime.py' 6 | ] 7 | angle_version = 'le90' 8 | 9 | model = dict( 10 | type='mmdet.RetinaNet', 11 | data_preprocessor=dict( 12 | type='mmdet.DetDataPreprocessor', 13 | mean=[123.675, 116.28, 103.53], 14 | std=[58.395, 57.12, 57.375], 15 | bgr_to_rgb=True, 16 | pad_size_divisor=32, 17 | boxtype2tensor=False), 18 | backbone=dict( 19 | type='mmdet.ResNet', 20 | depth=50, 21 | num_stages=4, 22 | out_indices=(0, 1, 2, 3), 23 | frozen_stages=1, 24 | norm_cfg=dict(type='BN', requires_grad=True), 25 | norm_eval=True, 26 | style='pytorch', 27 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 28 | neck=dict( 29 | type='mmdet.FPN', 30 | in_channels=[256, 512, 1024, 2048], 31 | out_channels=256, 32 | start_level=1, 33 | add_extra_convs='on_input', 34 | num_outs=5), 35 | bbox_head=dict( 36 | type='mmdet.RetinaHead', 37 | num_classes=6, # 38 | in_channels=256, 39 | stacked_convs=4, 40 | feat_channels=256, 41 | anchor_generator=dict( 42 | type='FakeRotatedAnchorGenerator', 43 | angle_version=angle_version, 44 | octave_base_scale=4, 45 | scales_per_octave=3, 46 | ratios=[1.0, 0.5, 2.0], 47 | strides=[8, 16, 32, 64, 128]), 48 | bbox_coder=dict( 49 | type='DeltaXYWHTRBBoxCoder', 50 | angle_version=angle_version, 51 | norm_factor=None, 52 | edge_swap=True, 53 | proj_xy=True, 54 | target_means=(.0, .0, .0, .0, .0), 55 | target_stds=(1.0, 1.0, 1.0, 1.0, 
1.0)), 56 | loss_cls=dict( 57 | type='mmdet.FocalLoss', 58 | use_sigmoid=True, 59 | gamma=2.0, 60 | alpha=0.25, 61 | loss_weight=1.0), 62 | loss_bbox=dict(type='mmdet.L1Loss', loss_weight=1.0)), 63 | train_cfg=dict( 64 | assigner=dict( 65 | type='mmdet.MaxIoUAssigner', 66 | pos_iou_thr=0.5, 67 | neg_iou_thr=0.4, 68 | min_pos_iou=0, 69 | ignore_iof_thr=-1, 70 | iou_calculator=dict(type='RBboxOverlaps2D')), 71 | sampler=dict( 72 | type='mmdet.PseudoSampler'), # Focal loss should use PseudoSampler 73 | allowed_border=-1, 74 | pos_weight=-1, 75 | debug=False), 76 | test_cfg=dict( 77 | nms_pre=2000, 78 | min_bbox_size=0, 79 | score_thr=0.05, 80 | nms=dict(type='nms_rotated', iou_threshold=0.1), 81 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-retinanet-rbox-le90_r50_fpn_1x_rsar.py: -------------------------------------------------------------------------------- 1 | # https://github.com/zhasion/RSAR/blob/main/configs/rotated_retinanet/rotated-retinanet-rbox-le90_r50_fpn_1x_rsar.py 2 | _base_ = [ 3 | './rsar.py', 4 | 'mmrotate::_base_/schedules/schedule_1x.py', 5 | 'mmrotate::_base_/default_runtime.py' 6 | ] 7 | angle_version = 'le90' 8 | 9 | model = dict( 10 | type='mmdet.RetinaNet', 11 | data_preprocessor=dict( 12 | type='mmdet.DetDataPreprocessor', 13 | mean=[123.675, 116.28, 103.53], 14 | std=[58.395, 57.12, 57.375], 15 | bgr_to_rgb=True, 16 | pad_size_divisor=32, 17 | boxtype2tensor=False), 18 | backbone=dict( 19 | type='mmdet.ResNet', 20 | depth=50, 21 | num_stages=4, 22 | out_indices=(0, 1, 2, 3), 23 | frozen_stages=1, 24 | norm_cfg=dict(type='BN', requires_grad=True), 25 | norm_eval=True, 26 | style='pytorch', 27 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 28 | neck=dict( 29 | type='mmdet.FPN', 30 | in_channels=[256, 512, 1024, 2048], 31 | out_channels=256, 32 | start_level=1, 33 | add_extra_convs='on_input', 34 | num_outs=5), 35 | bbox_head=dict( 36 | type='mmdet.RetinaHead', 37 | num_classes=6, # 38 | in_channels=256, 39 | stacked_convs=4, 40 | feat_channels=256, 41 | anchor_generator=dict( 42 | type='FakeRotatedAnchorGenerator', 43 | angle_version=angle_version, 44 | octave_base_scale=4, 45 | scales_per_octave=3, 46 | ratios=[1.0, 0.5, 2.0], 47 | strides=[8, 16, 32, 64, 128]), 48 | bbox_coder=dict( 49 | type='DeltaXYWHTRBBoxCoder', 50 | angle_version=angle_version, 51 | norm_factor=None, 52 | edge_swap=True, 53 | proj_xy=True, 54 | target_means=(.0, .0, .0, .0, .0), 55 | target_stds=(1.0, 1.0, 1.0, 1.0, 1.0)), 56 | loss_cls=dict( 57 | type='mmdet.FocalLoss', 58 | use_sigmoid=True, 59 | gamma=2.0, 60 | alpha=0.25, 61 | loss_weight=1.0), 62 | loss_bbox=dict(type='mmdet.L1Loss', loss_weight=1.0)), 63 | train_cfg=dict( 64 | assigner=dict( 65 | type='mmdet.MaxIoUAssigner', 66 | pos_iou_thr=0.5, 67 | neg_iou_thr=0.4, 68 | min_pos_iou=0, 69 | ignore_iof_thr=-1, 70 | iou_calculator=dict(type='RBboxOverlaps2D')), 71 | sampler=dict( 72 | type='mmdet.PseudoSampler'), # Focal loss should use PseudoSampler 73 | allowed_border=-1, 74 | pos_weight=-1, 75 | debug=False), 76 | test_cfg=dict( 77 | nms_pre=2000, 78 | min_bbox_size=0, 79 | score_thr=0.05, 80 | nms=dict(type='nms_rotated', iou_threshold=0.1), 81 | max_per_img=2000)) -------------------------------------------------------------------------------- /playground/mmrotate_configs/rotated-retinanet-rbox-le90_r50_fpn_6x_srsdd.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 
| './srsdd.py', 3 | 'mmrotate::_base_/schedules/schedule_6x.py', 4 | 'mmrotate::_base_/default_runtime.py' 5 | ] 6 | angle_version = 'le90' 7 | 8 | model = dict( 9 | type='mmdet.RetinaNet', 10 | data_preprocessor=dict( 11 | type='mmdet.DetDataPreprocessor', 12 | mean=[123.675, 116.28, 103.53], 13 | std=[58.395, 57.12, 57.375], 14 | bgr_to_rgb=True, 15 | pad_size_divisor=32, 16 | boxtype2tensor=False), 17 | backbone=dict( 18 | type='mmdet.ResNet', 19 | depth=50, 20 | num_stages=4, 21 | out_indices=(0, 1, 2, 3), 22 | frozen_stages=1, 23 | norm_cfg=dict(type='BN', requires_grad=True), 24 | norm_eval=True, 25 | style='pytorch', 26 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 27 | neck=dict( 28 | type='mmdet.FPN', 29 | in_channels=[256, 512, 1024, 2048], 30 | out_channels=256, 31 | start_level=1, 32 | add_extra_convs='on_input', 33 | num_outs=5), 34 | bbox_head=dict( 35 | type='mmdet.RetinaHead', 36 | num_classes=6, 37 | in_channels=256, 38 | stacked_convs=4, 39 | feat_channels=256, 40 | anchor_generator=dict( 41 | type='FakeRotatedAnchorGenerator', 42 | angle_version=angle_version, 43 | octave_base_scale=4, 44 | scales_per_octave=3, 45 | ratios=[1.0, 0.5, 2.0], 46 | strides=[8, 16, 32, 64, 128]), 47 | bbox_coder=dict( 48 | type='DeltaXYWHTRBBoxCoder', 49 | angle_version=angle_version, 50 | norm_factor=None, 51 | edge_swap=True, 52 | proj_xy=True, 53 | target_means=(.0, .0, .0, .0, .0), 54 | target_stds=(1.0, 1.0, 1.0, 1.0, 1.0)), 55 | loss_cls=dict( 56 | type='mmdet.FocalLoss', 57 | use_sigmoid=True, 58 | gamma=2.0, 59 | alpha=0.25, 60 | loss_weight=1.0), 61 | loss_bbox=dict(type='mmdet.L1Loss', loss_weight=1.0)), 62 | train_cfg=dict( 63 | assigner=dict( 64 | type='mmdet.MaxIoUAssigner', 65 | pos_iou_thr=0.5, 66 | neg_iou_thr=0.4, 67 | min_pos_iou=0, 68 | ignore_iof_thr=-1, 69 | iou_calculator=dict(type='RBboxOverlaps2D')), 70 | sampler=dict( 71 | type='mmdet.PseudoSampler'), # Focal loss should use PseudoSampler 72 | allowed_border=-1, 73 | pos_weight=-1, 74 | debug=False), 75 | test_cfg=dict( 76 | nms_pre=2000, 77 | min_bbox_size=0, 78 | score_thr=0.05, 79 | nms=dict(type='nms_rotated', iou_threshold=0.1), 80 | max_per_img=2000)) 81 | -------------------------------------------------------------------------------- /playground/mmrotate_configs/rsar-1024.py: -------------------------------------------------------------------------------- 1 | # https://github.com/zhasion/RSAR/blob/main/configs/_base_/datasets/rsar.py 2 | # This differs from the official setting, which trains on the `train` split and evaluates on the `test` split. 3 | # We unified the splits here: training uses `trainval` and evaluation uses the `test` split. 
4 | # dataset settings 5 | dataset_type = 'RSARDataset' 6 | data_root = 'playground/data/RSAR/' 7 | backend_args = None 8 | 9 | custom_imports = dict(imports=['lmmrotate.modules.rsar_dataset'], allow_failed_imports=False) 10 | 11 | train_pipeline = [ 12 | dict(type='mmdet.LoadImageFromFile', backend_args=backend_args), 13 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 14 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 15 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 16 | dict(type='mmdet.Pad', size=(1024, 1024), pad_val=dict(img=(0, 0, 0))), 17 | dict( 18 | type='mmdet.RandomFlip', 19 | prob=0.75, 20 | direction=['horizontal', 'vertical', 'diagonal']), 21 | dict(type='mmdet.PackDetInputs') 22 | ] 23 | val_pipeline = [ 24 | dict(type='mmdet.LoadImageFromFile', backend_args=backend_args), 25 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 26 | dict(type='mmdet.Pad', size=(1024, 1024), pad_val=dict(img=(0, 0, 0))), 27 | # avoid bboxes being resized 28 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 29 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 30 | dict( 31 | type='mmdet.PackDetInputs', 32 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 33 | 'scale_factor')) 34 | ] 35 | test_pipeline = [ 36 | dict(type='mmdet.LoadImageFromFile', backend_args=backend_args), 37 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 38 | dict(type='mmdet.Pad', size=(1024, 1024), pad_val=dict(img=(0, 0, 0))), 39 | # If you don't have a gt annotation, delete the pipeline 40 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 41 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 42 | dict( 43 | type='mmdet.PackDetInputs', 44 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 45 | 'scale_factor')) 46 | ] 47 | train_dataloader = dict( 48 | batch_size=2, 49 | num_workers=2, 50 | persistent_workers=True, 51 | sampler=dict(type='DefaultSampler', shuffle=True), 52 | batch_sampler=None, 53 | dataset=dict( 54 | type=dataset_type, 55 | data_root=data_root, 56 | ann_file='trainval/annfiles/', 57 | data_prefix=dict(img_path='trainval/images/'), 58 | filter_cfg=dict(filter_empty_gt=True), 59 | pipeline=train_pipeline)) 60 | val_dataloader = dict( 61 | batch_size=1, 62 | num_workers=2, 63 | persistent_workers=True, 64 | drop_last=False, 65 | sampler=dict(type='DefaultSampler', shuffle=False), 66 | dataset=dict( 67 | type=dataset_type, 68 | data_root=data_root, 69 | ann_file='test/annfiles/', 70 | data_prefix=dict(img_path='test/images/'), 71 | test_mode=True, 72 | pipeline=val_pipeline)) 73 | test_dataloader = val_dataloader 74 | 75 | # val_evaluator = dict(type='DOTAMetric', metric='mAP', iou_thrs=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]) 76 | val_evaluator = dict(type='DOTAMetric', metric='mAP', iou_thrs=[0.5]) 77 | test_evaluator = val_evaluator 78 | 79 | # inference on test dataset and format the output results 80 | # for submission. Note: the test set has no annotation. 
81 | test_dataloader = dict( 82 | batch_size=1, 83 | num_workers=2, 84 | persistent_workers=True, 85 | drop_last=False, 86 | sampler=dict(type='DefaultSampler', shuffle=False), 87 | dataset=dict( 88 | type=dataset_type, 89 | data_root=data_root, 90 | ann_file='test/annfiles/', 91 | data_prefix=dict(img_path='test/images/'), 92 | test_mode=True, 93 | pipeline=test_pipeline)) 94 | -------------------------------------------------------------------------------- /playground/mmrotate_configs/rsar.py: -------------------------------------------------------------------------------- 1 | # https://github.com/zhasion/RSAR/blob/main/configs/_base_/datasets/rsar.py 2 | # This differs from the official setting, which trains on the `train` split and evaluates on the `test` split. 3 | # We unified the splits here: training uses `trainval` and evaluation uses the `test` split. 4 | # dataset settings 5 | dataset_type = 'RSARDataset' 6 | data_root = 'playground/data/RSAR/' 7 | backend_args = None 8 | 9 | custom_imports = dict(imports=['lmmrotate.modules.rsar_dataset'], allow_failed_imports=False) 10 | 11 | train_pipeline = [ 12 | dict(type='mmdet.LoadImageFromFile', backend_args=backend_args), 13 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 14 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 15 | dict(type='mmdet.Resize', scale=(800, 800), keep_ratio=True), 16 | dict(type='mmdet.Pad', size=(800, 800), pad_val=dict(img=(0, 0, 0))), 17 | dict( 18 | type='mmdet.RandomFlip', 19 | prob=0.75, 20 | direction=['horizontal', 'vertical', 'diagonal']), 21 | dict(type='mmdet.PackDetInputs') 22 | ] 23 | val_pipeline = [ 24 | dict(type='mmdet.LoadImageFromFile', backend_args=backend_args), 25 | dict(type='mmdet.Resize', scale=(800, 800), keep_ratio=True), 26 | dict(type='mmdet.Pad', size=(800, 800), pad_val=dict(img=(0, 0, 0))), 27 | # avoid bboxes being resized 28 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 29 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 30 | dict( 31 | type='mmdet.PackDetInputs', 32 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 33 | 'scale_factor')) 34 | ] 35 | test_pipeline = [ 36 | dict(type='mmdet.LoadImageFromFile', backend_args=backend_args), 37 | dict(type='mmdet.Resize', scale=(800, 800), keep_ratio=True), 38 | dict(type='mmdet.Pad', size=(800, 800), pad_val=dict(img=(0, 0, 0))), 39 | # If you don't have a gt annotation, delete the pipeline 40 | dict(type='mmdet.LoadAnnotations', with_bbox=True, box_type='qbox'), 41 | dict(type='ConvertBoxType', box_type_mapping=dict(gt_bboxes='rbox')), 42 | dict( 43 | type='mmdet.PackDetInputs', 44 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 45 | 'scale_factor')) 46 | ] 47 | train_dataloader = dict( 48 | batch_size=2, 49 | num_workers=2, 50 | persistent_workers=True, 51 | sampler=dict(type='DefaultSampler', shuffle=True), 52 | batch_sampler=None, 53 | dataset=dict( 54 | type=dataset_type, 55 | data_root=data_root, 56 | ann_file='trainval/annfiles/', 57 | data_prefix=dict(img_path='trainval/images/'), 58 | filter_cfg=dict(filter_empty_gt=True), 59 | pipeline=train_pipeline)) 60 | val_dataloader = dict( 61 | batch_size=1, 62 | num_workers=2, 63 | persistent_workers=True, 64 | drop_last=False, 65 | sampler=dict(type='DefaultSampler', shuffle=False), 66 | dataset=dict( 67 | type=dataset_type, 68 | data_root=data_root, 69 | ann_file='test/annfiles/', 70 | data_prefix=dict(img_path='test/images/'), 71 | test_mode=True, 72 | 
pipeline=val_pipeline)) 73 | test_dataloader = val_dataloader 74 | 75 | # val_evaluator = dict(type='DOTAMetric', metric='mAP', iou_thrs=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]) 76 | val_evaluator = dict(type='DOTAMetric', metric='mAP', iou_thrs=[0.5]) 77 | test_evaluator = val_evaluator 78 | 79 | # inference on test dataset and format the output results 80 | # for submission. Note: the test set has no annotation. 81 | test_dataloader = dict( 82 | batch_size=1, 83 | num_workers=2, 84 | persistent_workers=True, 85 | drop_last=False, 86 | sampler=dict(type='DefaultSampler', shuffle=False), 87 | dataset=dict( 88 | type=dataset_type, 89 | data_root=data_root, 90 | ann_file='test/annfiles/', 91 | data_prefix=dict(img_path='test/images/'), 92 | test_mode=True, 93 | pipeline=test_pipeline)) 94 | -------------------------------------------------------------------------------- /playground/mmrotate_configs/srsdd.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'mmdet.CocoDataset' 3 | data_root = 'playground/data/SRSDD/' 4 | 5 | train_pipeline = [ 6 | dict(type='mmdet.LoadImageFromFile'), 7 | dict( 8 | type='mmdet.LoadAnnotations', 9 | with_bbox=True, 10 | with_mask=True, 11 | poly2mask=False), 12 | dict(type='ConvertMask2BoxType', box_type='rbox'), 13 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 14 | dict( 15 | type='mmdet.RandomFlip', 16 | prob=0.75, 17 | direction=['horizontal', 'vertical', 'diagonal']), 18 | dict(type='mmdet.PackDetInputs') 19 | ] 20 | val_pipeline = [ 21 | dict(type='mmdet.LoadImageFromFile'), 22 | dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), 23 | # avoid bboxes being resized 24 | dict( 25 | type='mmdet.LoadAnnotations', 26 | with_bbox=True, 27 | with_mask=True, 28 | poly2mask=False), 29 | dict(type='ConvertMask2BoxType', box_type='qbox'), 30 | dict( 31 | type='mmdet.PackDetInputs', 32 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 33 | 'scale_factor', 'instances')) 34 | ] 35 | 36 | metainfo = dict( 37 | classes=('Cell-Container', 'Container', 'Dredger', 'Fishing', 'LawEnforce', 'ore-oil')) 38 | 39 | train_dataloader = dict( 40 | batch_size=2, 41 | num_workers=2, 42 | persistent_workers=True, 43 | sampler=dict(type='DefaultSampler', shuffle=True), 44 | batch_sampler=None, 45 | dataset=dict( 46 | type=dataset_type, 47 | metainfo=metainfo, 48 | data_root=data_root, 49 | ann_file='train.json', 50 | data_prefix=dict(img='train/images/'), 51 | filter_cfg=dict(filter_empty_gt=True), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=2, 56 | persistent_workers=True, 57 | drop_last=False, 58 | sampler=dict(type='DefaultSampler', shuffle=False), 59 | dataset=dict( 60 | type=dataset_type, 61 | metainfo=metainfo, 62 | data_root=data_root, 63 | ann_file='test.json', 64 | data_prefix=dict(img='test/images/'), 65 | test_mode=True, 66 | pipeline=val_pipeline)) 67 | test_dataloader = val_dataloader 68 | 69 | # val_evaluator = dict(type='DOTAMetric', metric='mAP') 70 | val_evaluator = dict(type='RotatedCocoMetric', metric='bbox', classwise=True) 71 | test_evaluator = val_evaluator -------------------------------------------------------------------------------- /playground/mmrotate_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import argparse 3 | import os 4 | import os.path as osp 5 | 6 | from mmdet.utils import register_all_modules as register_all_modules_mmdet 7 | from mmengine.config import Config, DictAction 8 | from mmengine.evaluator import DumpResults 9 | from mmengine.registry import RUNNERS 10 | from mmengine.runner import Runner 11 | 12 | from mmrotate.utils import register_all_modules 13 | 14 | 15 | def monkey_patch_of_collections_typehint_for_mmrotate1x(): 16 | import collections 17 | from collections.abc import Mapping, Sequence, Iterable 18 | collections.Mapping = Mapping 19 | collections.Sequence = Sequence 20 | collections.Iterable = Iterable 21 | 22 | monkey_patch_of_collections_typehint_for_mmrotate1x() 23 | 24 | 25 | def parse_args(): 26 | parser = argparse.ArgumentParser(description='Test (and eval) a model') 27 | parser.add_argument('config', help='test config file path') 28 | parser.add_argument('checkpoint', help='checkpoint file') 29 | parser.add_argument( 30 | '--work-dir', 31 | help='the directory to save the file containing evaluation metrics') 32 | parser.add_argument( 33 | '--out', 34 | type=str, 35 | help='dump predictions to a pickle file for offline evaluation') 36 | parser.add_argument( 37 | '--show', action='store_true', help='show prediction results') 38 | parser.add_argument( 39 | '--show-dir', 40 | help='directory where painted images will be saved. ' 41 | 'If specified, it will be automatically saved ' 42 | 'to the work_dir/timestamp/show_dir') 43 | parser.add_argument( 44 | '--wait-time', type=float, default=2, help='the interval of show (s)') 45 | parser.add_argument( 46 | '--cfg-options', 47 | nargs='+', 48 | action=DictAction, 49 | help='override some settings in the used config, the key-value pair ' 50 | 'in xxx=yyy format will be merged into config file. If the value to ' 51 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 52 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 53 | 'Note that the quotation marks are necessary and that no white space ' 54 | 'is allowed.') 55 | parser.add_argument( 56 | '--launcher', 57 | choices=['none', 'pytorch', 'slurm', 'mpi'], 58 | default='none', 59 | help='job launcher') 60 | parser.add_argument('--local_rank', type=int, default=0) 61 | args = parser.parse_args() 62 | if 'LOCAL_RANK' not in os.environ: 63 | os.environ['LOCAL_RANK'] = str(args.local_rank) 64 | return args 65 | 66 | 67 | def trigger_visualization_hook(cfg, args): 68 | default_hooks = cfg.default_hooks 69 | if 'visualization' in default_hooks: 70 | visualization_hook = default_hooks['visualization'] 71 | # Turn on visualization 72 | visualization_hook['draw'] = True 73 | if args.show: 74 | visualization_hook['show'] = True 75 | visualization_hook['wait_time'] = args.wait_time 76 | if args.show_dir: 77 | visualization_hook['test_out_dir'] = args.show_dir 78 | else: 79 | raise RuntimeError( 80 | 'VisualizationHook must be included in default_hooks.' 
81 | 'refer to usage ' 82 | '"visualization=dict(type=\'VisualizationHook\')"') 83 | 84 | return cfg 85 | 86 | 87 | def main(): 88 | args = parse_args() 89 | 90 | # register all modules in mmdet into the registries 91 | # do not init the default scope here because it will be init in the runner 92 | register_all_modules_mmdet(init_default_scope=False) 93 | register_all_modules(init_default_scope=False) 94 | 95 | # load config 96 | cfg = Config.fromfile(args.config) 97 | cfg.launcher = args.launcher 98 | if args.cfg_options is not None: 99 | cfg.merge_from_dict(args.cfg_options) 100 | 101 | # work_dir is determined in this priority: CLI > segment in file > filename 102 | if args.work_dir is not None: 103 | # update configs according to CLI args if args.work_dir is not None 104 | cfg.work_dir = args.work_dir 105 | elif cfg.get('work_dir', None) is None: 106 | # use config filename as default work_dir if cfg.work_dir is None 107 | cfg.work_dir = osp.join('./work_dirs', 108 | osp.splitext(osp.basename(args.config))[0]) 109 | 110 | cfg.load_from = args.checkpoint 111 | 112 | if args.show or args.show_dir: 113 | cfg = trigger_visualization_hook(cfg, args) 114 | 115 | # build the runner from config 116 | if 'runner_type' not in cfg: 117 | # build the default runner 118 | runner = Runner.from_cfg(cfg) 119 | else: 120 | # build customized runner from the registry 121 | # if 'runner_type' is set in the cfg 122 | runner = RUNNERS.build(cfg) 123 | 124 | # add `DumpResults` dummy metric 125 | if args.out is not None: 126 | assert args.out.endswith(('.pkl', '.pickle')), \ 127 | 'The dump file must be a pkl file.' 128 | runner.test_evaluator.metrics.append( 129 | DumpResults(out_file_path=args.out)) 130 | 131 | # start testing 132 | runner.test() 133 | 134 | 135 | if __name__ == '__main__': 136 | main() -------------------------------------------------------------------------------- /playground/mmrotate_train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import argparse 3 | import logging 4 | import os 5 | import os.path as osp 6 | 7 | from mmdet.utils import register_all_modules as register_all_modules_mmdet 8 | from mmengine.config import Config, DictAction 9 | from mmengine.logging import print_log 10 | from mmengine.registry import RUNNERS 11 | from mmengine.runner import Runner 12 | 13 | from mmrotate.utils import register_all_modules 14 | 15 | 16 | def monkey_patch_of_collections_typehint_for_mmrotate1x(): 17 | import collections 18 | from collections.abc import Mapping, Sequence, Iterable 19 | collections.Mapping = Mapping 20 | collections.Sequence = Sequence 21 | collections.Iterable = Iterable 22 | 23 | monkey_patch_of_collections_typehint_for_mmrotate1x() 24 | 25 | 26 | def parse_args(): 27 | parser = argparse.ArgumentParser(description='Train a detector') 28 | parser.add_argument('config', help='train config file path') 29 | parser.add_argument('--work-dir', help='the dir to save logs and models') 30 | parser.add_argument( 31 | '--amp', 32 | action='store_true', 33 | default=False, 34 | help='enable automatic-mixed-precision training') 35 | parser.add_argument( 36 | '--auto-scale-lr', 37 | action='store_true', 38 | help='enable automatically scaling LR.') 39 | parser.add_argument( 40 | '--resume', 41 | action='store_true', 42 | help='resume from the latest checkpoint in the work_dir automatically') 43 | parser.add_argument( 44 | '--cfg-options', 45 | nargs='+', 46 | action=DictAction, 47 | help='override some settings in the used config, the key-value pair ' 48 | 'in xxx=yyy format will be merged into config file. If the value to ' 49 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 50 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 51 | 'Note that the quotation marks are necessary and that no white space ' 52 | 'is allowed.') 53 | parser.add_argument( 54 | '--launcher', 55 | choices=['none', 'pytorch', 'slurm', 'mpi'], 56 | default='none', 57 | help='job launcher') 58 | # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` 59 | # will pass the `--local-rank` parameter to `tools/train.py` instead 60 | # of `--local_rank`. 
61 | parser.add_argument('--local_rank', '--local-rank', type=int, default=0) 62 | args = parser.parse_args() 63 | if 'LOCAL_RANK' not in os.environ: 64 | os.environ['LOCAL_RANK'] = str(args.local_rank) 65 | 66 | return args 67 | 68 | 69 | def main(): 70 | args = parse_args() 71 | 72 | # register all modules in mmdet into the registries 73 | # do not init the default scope here because it will be init in the runner 74 | register_all_modules_mmdet(init_default_scope=False) 75 | register_all_modules(init_default_scope=False) 76 | 77 | # load config 78 | cfg = Config.fromfile(args.config) 79 | cfg.launcher = args.launcher 80 | if args.cfg_options is not None: 81 | cfg.merge_from_dict(args.cfg_options) 82 | 83 | # work_dir is determined in this priority: CLI > segment in file > filename 84 | if args.work_dir is not None: 85 | # update configs according to CLI args if args.work_dir is not None 86 | cfg.work_dir = args.work_dir 87 | elif cfg.get('work_dir', None) is None: 88 | # use config filename as default work_dir if cfg.work_dir is None 89 | cfg.work_dir = osp.join('./work_dirs', 90 | osp.splitext(osp.basename(args.config))[0]) 91 | 92 | # enable automatic-mixed-precision training 93 | if args.amp is True: 94 | optim_wrapper = cfg.optim_wrapper.type 95 | if optim_wrapper == 'AmpOptimWrapper': 96 | print_log( 97 | 'AMP training is already enabled in your config.', 98 | logger='current', 99 | level=logging.WARNING) 100 | else: 101 | assert optim_wrapper == 'OptimWrapper', ( 102 | '`--amp` is only supported when the optimizer wrapper type is ' 103 | f'`OptimWrapper` but got {optim_wrapper}.') 104 | cfg.optim_wrapper.type = 'AmpOptimWrapper' 105 | cfg.optim_wrapper.loss_scale = 'dynamic' 106 | 107 | # enable automatically scaling LR 108 | if args.auto_scale_lr: 109 | if 'auto_scale_lr' in cfg and \ 110 | 'enable' in cfg.auto_scale_lr and \ 111 | 'base_batch_size' in cfg.auto_scale_lr: 112 | cfg.auto_scale_lr.enable = True 113 | else: 114 | raise RuntimeError('Can not find "auto_scale_lr" or ' 115 | '"auto_scale_lr.enable" or ' 116 | '"auto_scale_lr.base_batch_size" in your' 117 | ' configuration file.') 118 | 119 | cfg.resume = args.resume 120 | 121 | # build the runner from config 122 | if 'runner_type' not in cfg: 123 | # build the default runner 124 | runner = Runner.from_cfg(cfg) 125 | else: 126 | # build customized runner from the registry 127 | # if 'runner_type' is set in the cfg 128 | runner = RUNNERS.build(cfg) 129 | 130 | # start training 131 | runner.train() 132 | 133 | 134 | if __name__ == '__main__': 135 | main() -------------------------------------------------------------------------------- /playground/times.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionXLab/mllm-mmrotate/110f0ffdfd6788c0a73eaa3457cc4dfbdfc28bbb/playground/times.ttf -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "lmmrotate" 7 | version = "1.0.0" 8 | description = "A codebase for training multimodal language models for oriented object detection."
9 | authors = [ 10 | { name = "Li-Qingyun", email = "962537281@qq.com" }, 11 | { name = "Xue Yang", email = "yangxue0827@126.com" }, 12 | ] 13 | readme = "README.md" 14 | requires-python = ">=3.8" 15 | dependencies = [ 16 | "torch==2.3.0", 17 | "torchvision==0.18.0", 18 | "transformers==4.45.0", 19 | "peft", 20 | "datasets", 21 | "huggingface_hub", 22 | "timm", 23 | "deepspeed==0.14.4", 24 | "accelerate>=0.29.1", 25 | "numpy<2", 26 | "einops", 27 | "tensorboard", 28 | "scipy", 29 | "scikit-learn", 30 | "rapidfuzz", 31 | "openmim", 32 | "fire", 33 | ] 34 | classifiers = [ 35 | "Programming Language :: Python :: 3", 36 | "License :: OSI Approved :: Apache Software License", 37 | "Development Status :: 1 - Planning", 38 | "Intended Audience :: Developers", 39 | "Intended Audience :: Science/Research", 40 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 41 | ] 42 | 43 | [tool.setuptools.packages.find] 44 | include = ["lmmrotate*"] 45 | exclude = [ 46 | "checkpoints*", 47 | "playground*", 48 | "scripts*", 49 | "scripts_sh*", 50 | "scripts_py*", 51 | "tests*", 52 | "wandb*", 53 | ] 54 | 55 | [tool.wheel] 56 | exclude = [ 57 | "checkpoints*", 58 | "playground*", 59 | "scripts*", 60 | "scripts_sh*", 61 | "scripts_py*", 62 | "tests*", 63 | "wandb*", 64 | ] -------------------------------------------------------------------------------- /scripts/eval_standalone.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NGPUS=${NGPUS:-1} 3 | SPLIT=${SPLIT:-"trainval test"} 4 | CKPT=$1 5 | OTHER_ARGS=${@:2} 6 | 7 | torchrun --nproc_per_node=${NGPUS} -m lmmrotate.eval --model_ckpt_path ${CKPT} --split ${SPLIT} ${OTHER_ARGS} 8 | -------------------------------------------------------------------------------- /scripts/florence-2-l_vis1024-lang2048_dota1-v2_b1x1xga32-50e.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RUN_NAME="florence-2-l_vis1024-lang2048_dota1-v2_b1x1xga32-50e" 3 | 4 | set -x 5 | ACCELERATE_CPU_AFFINITY=1 python -m lmmrotate.train \ 6 | --model_name_or_path 'microsoft/Florence-2-large' \ 7 | --image_square_length 1024 \ 8 | --language_model_max_length 2048 \ 9 | --data_path ./playground/data/florence-dota/florence_split_ss_dota_trainval_v2.json \ 10 | --image_folder ./playground/data/split_ss_dota/trainval/images \ 11 | --fp16 True \ 12 | --attn_implementation "flash_attention_2" \ 13 | --output_dir ./checkpoints/${RUN_NAME} \ 14 | --num_train_epochs 50 \ 15 | --per_device_train_batch_size 1 \ 16 | --per_device_eval_batch_size 1 \ 17 | --gradient_accumulation_steps 32 \ 18 | --eval_strategy "no" \ 19 | --eval_steps 200 \ 20 | --save_strategy "steps" \ 21 | --save_steps 50000 \ 22 | --learning_rate 2e-5 \ 23 | --weight_decay 0. 
\ 24 | --warmup_ratio 0.03 \ 25 | --lr_scheduler_type "cosine" \ 26 | --logging_steps 1 \ 27 | --gradient_checkpointing False \ 28 | --dataloader_num_workers 4 \ 29 | --report_to "tensorboard" \ 30 | --run_name ${RUN_NAME} 31 | -------------------------------------------------------------------------------- /scripts/florence-2-l_vis1024-lang2048_dota1-v2_b2x2xga8-50e.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RUN_NAME="florence-2-l_vis1024-lang2048_dota1-v2_b2x2xga8-50e-zero2" 3 | 4 | set -x 5 | PYTHONPATH="."$PYTHONPATH ACCELERATE_CPU_AFFINITY=1 torchrun --nnodes=1 --nproc-per-node=2 \ 6 | -m lmmrotate.train \ 7 | --deepspeed ./lmmrotate/deepspeed_config/zero2.json \ 8 | --model_name_or_path 'microsoft/Florence-2-large' \ 9 | --image_square_length 1024 \ 10 | --language_model_max_length 2048 \ 11 | --data_path ./playground/data/florence-dota/florence_split_ss_dota_trainval_v2.json \ 12 | --image_folder ./playground/data/split_ss_dota/trainval/images \ 13 | --fp16 True \ 14 | --attn_implementation "flash_attention_2" \ 15 | --output_dir ./checkpoints/${RUN_NAME} \ 16 | --num_train_epochs 50 \ 17 | --per_device_train_batch_size 2 \ 18 | --per_device_eval_batch_size 1 \ 19 | --gradient_accumulation_steps 8 \ 20 | --eval_strategy "no" \ 21 | --eval_steps 200 \ 22 | --save_strategy "steps" \ 23 | --save_steps 50000 \ 24 | --learning_rate 2e-5 \ 25 | --weight_decay 0. \ 26 | --warmup_ratio 0.03 \ 27 | --lr_scheduler_type "cosine" \ 28 | --logging_steps 1 \ 29 | --gradient_checkpointing False \ 30 | --dataloader_num_workers 4 \ 31 | --report_to "tensorboard" \ 32 | --run_name ${RUN_NAME} 33 | -------------------------------------------------------------------------------- /scripts/florence-2-l_vis1024-lang2048_dota1-v2_b2x4xga4-50e.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RUN_NAME="florence-2-l_vis1024-lang2048_dota1-v2_b2x4xga4-50e-zero2" 3 | 4 | set -x 5 | PYTHONPATH="."$PYTHONPATH ACCELERATE_CPU_AFFINITY=1 torchrun --nnodes=1 --nproc-per-node=4 \ 6 | -m lmmrotate.train \ 7 | --deepspeed ./lmmrotate/deepspeed_config/zero2.json \ 8 | --model_name_or_path 'microsoft/Florence-2-large' \ 9 | --image_square_length 1024 \ 10 | --language_model_max_length 2048 \ 11 | --data_path ./playground/data/florence-dota/florence_split_ss_dota_trainval_v2.json \ 12 | --image_folder ./playground/data/split_ss_dota/trainval/images \ 13 | --fp16 True \ 14 | --attn_implementation "flash_attention_2" \ 15 | --output_dir ./checkpoints/${RUN_NAME} \ 16 | --num_train_epochs 50 \ 17 | --per_device_train_batch_size 2 \ 18 | --per_device_eval_batch_size 1 \ 19 | --gradient_accumulation_steps 4 \ 20 | --eval_strategy "no" \ 21 | --eval_steps 200 \ 22 | --save_strategy "steps" \ 23 | --save_steps 50000 \ 24 | --learning_rate 2e-5 \ 25 | --weight_decay 0. 
\ 26 | --warmup_ratio 0.03 \ 27 | --lr_scheduler_type "cosine" \ 28 | --logging_steps 1 \ 29 | --gradient_checkpointing False \ 30 | --dataloader_num_workers 4 \ 31 | --report_to "tensorboard" \ 32 | --run_name ${RUN_NAME} 33 | -------------------------------------------------------------------------------- /scripts/slurm/eval_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SPLIT=${SPLIT:-"trainval test"} 3 | CKPT=$1 4 | OTHER_ARGS=${@:2} 5 | 6 | NNODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l) 7 | export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 8 | HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") 9 | THEID=$(echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$(hostname)'.strip()]") 10 | echo MASTER_ADDR=$MASTER_ADDR 11 | echo HOSTNAMES=$HOSTNAMES 12 | echo SLURM_PROCID=$THEID 13 | 14 | PYTHONPATH="$PYTHONPATH:$(pwd)" SPLIT=${SPLIT} torchrun --nnodes=$NNODES --nproc-per-node=8 \ 15 | --master_port 12955 --master_addr ${MASTER_ADDR} --node_rank ${THEID} \ 16 | -m lmmrotate.eval --model_ckpt_path ${CKPT} --split ${SPLIT} ${OTHER_ARGS} 17 | -------------------------------------------------------------------------------- /scripts/slurm/florence-2-l_vis1024-lang2048_dota1-v2_b2x16-100e.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | TOTAL_BATCH_SIZE=${TOTAL_BATCH_SIZE:-32} 3 | NGPUS=16 4 | PER_RANK_BATCH_SIZE=$((TOTAL_BATCH_SIZE / NGPUS)) 5 | RUN_NAME="florence-2-l_vis1024-lang2048_dota1-v2_b2x16-100e-zero2" 6 | 7 | export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 8 | HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") 9 | THEID=$(echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$(hostname)'.strip()]") 10 | echo MASTER_ADDR=$MASTER_ADDR 11 | echo HOSTNAMES=$HOSTNAMES 12 | echo SLURM_PROCID=$THEID 13 | 14 | set -x 15 | PYTHONPATH="."$PYTHONPATH ACCELERATE_CPU_AFFINITY=1 torchrun --nnodes=2 --nproc-per-node=8 \ 16 | --master_port 12968 --master_addr ${MASTER_ADDR} --node_rank ${THEID} \ 17 | -m lmmrotate.train \ 18 | --deepspeed ./lmmrotate/deepspeed_config/zero2.json \ 19 | --model_name_or_path 'microsoft/Florence-2-large' \ 20 | --image_square_length 1024 \ 21 | --language_model_max_length 2048 \ 22 | --data_path ./playground/data/florence-dota/florence_split_ss_dota_trainval_v2.json \ 23 | --image_folder ./playground/data/split_ss_dota/trainval/images \ 24 | --bf16 True \ 25 | --attn_implementation "flash_attention_2" \ 26 | --output_dir ./checkpoints/${RUN_NAME} \ 27 | --num_train_epochs 100 \ 28 | --per_device_train_batch_size ${PER_RANK_BATCH_SIZE} \ 29 | --per_device_eval_batch_size 2 \ 30 | --gradient_accumulation_steps 1 \ 31 | --eval_strategy "no" \ 32 | --eval_steps 200 \ 33 | --save_strategy "steps" \ 34 | --save_steps 50000 \ 35 | --learning_rate 2e-5 \ 36 | --weight_decay 0. \ 37 | --warmup_ratio 0.03 \ 38 | --lr_scheduler_type "cosine" \ 39 | --logging_steps 1 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --report_to "tensorboard" \ 43 | --run_name ${RUN_NAME} 44 | --------------------------------------------------------------------------------