├── .github └── workflows │ └── lint.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MODEL_CARD.md ├── README.md ├── conda-extras.yaml ├── conda.yaml ├── dinov2 ├── __init__.py ├── configs │ ├── __init__.py │ ├── eval │ │ ├── vitb14_pretrain.yaml │ │ ├── vitb14_reg4_pretrain.yaml │ │ ├── vitg14_pretrain.yaml │ │ ├── vitg14_reg4_pretrain.yaml │ │ ├── vitl14_pretrain.yaml │ │ ├── vitl14_reg4_pretrain.yaml │ │ ├── vits14_pretrain.yaml │ │ └── vits14_reg4_pretrain.yaml │ ├── ssl_default_config.yaml │ └── train │ │ ├── vitg14.yaml │ │ ├── vitl14.yaml │ │ └── vitl16_short.yaml ├── data │ ├── __init__.py │ ├── adapters.py │ ├── augmentations.py │ ├── collate.py │ ├── datasets │ │ ├── __init__.py │ │ ├── decoders.py │ │ ├── extended.py │ │ ├── image_net.py │ │ └── image_net_22k.py │ ├── loaders.py │ ├── masking.py │ ├── samplers.py │ └── transforms.py ├── distributed │ └── __init__.py ├── eval │ ├── __init__.py │ ├── depth │ │ ├── __init__.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── backbones │ │ │ │ ├── __init__.py │ │ │ │ └── vision_transformer.py │ │ │ ├── builder.py │ │ │ ├── decode_heads │ │ │ │ ├── __init__.py │ │ │ │ ├── decode_head.py │ │ │ │ ├── dpt_head.py │ │ │ │ └── linear_head.py │ │ │ ├── depther │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── encoder_decoder.py │ │ │ └── losses │ │ │ │ ├── __init__.py │ │ │ │ ├── gradientloss.py │ │ │ │ └── sigloss.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ └── wrappers.py │ ├── knn.py │ ├── linear.py │ ├── log_regression.py │ ├── metrics.py │ ├── segmentation │ │ ├── __init__.py │ │ ├── hooks │ │ │ ├── __init__.py │ │ │ └── optimizer.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── backbones │ │ │ │ ├── __init__.py │ │ │ │ └── vision_transformer.py │ │ │ └── decode_heads │ │ │ │ ├── __init__.py │ │ │ │ └── linear_head.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── colormaps.py │ ├── segmentation_m2f │ │ ├── __init__.py │ │ ├── core │ │ │ ├── __init__.py │ │ │ ├── anchor │ │ │ │ ├── __init__.py │ │ │ │ ├── builder.py │ │ │ │ └── point_generator.py │ │ │ ├── box │ │ │ │ ├── __init__.py │ │ │ │ ├── builder.py │ │ │ │ └── samplers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base_sampler.py │ │ │ │ │ ├── mask_pseudo_sampler.py │ │ │ │ │ ├── mask_sampling_result.py │ │ │ │ │ └── sampling_result.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── dist_utils.py │ │ │ │ └── misc.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── backbones │ │ │ │ ├── __init__.py │ │ │ │ ├── adapter_modules.py │ │ │ │ ├── drop_path.py │ │ │ │ ├── vit.py │ │ │ │ └── vit_adapter.py │ │ │ ├── builder.py │ │ │ ├── decode_heads │ │ │ │ ├── __init__.py │ │ │ │ └── mask2former_head.py │ │ │ ├── losses │ │ │ │ ├── __init__.py │ │ │ │ ├── cross_entropy_loss.py │ │ │ │ ├── dice_loss.py │ │ │ │ └── match_costs.py │ │ │ ├── plugins │ │ │ │ ├── __init__.py │ │ │ │ └── msdeformattn_pixel_decoder.py │ │ │ ├── segmentors │ │ │ │ ├── __init__.py │ │ │ │ └── encoder_decoder_mask2former.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── assigner.py │ │ │ │ ├── point_sample.py │ │ │ │ ├── positional_encoding.py │ │ │ │ └── transformer.py │ │ └── ops │ │ │ └── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ ├── setup.py │ └── utils.py ├── fsdp │ └── __init__.py ├── hub │ ├── __init__.py │ ├── backbones.py │ ├── classifiers.py │ ├── depth │ │ ├── __init__.py │ │ ├── decode_heads.py │ │ ├── encoder_decoder.py │ │ └── ops.py │ ├── depthers.py │ └── utils.py ├── layers │ ├── __init__.py │ ├── attention.py │ ├── block.py │ ├── dino_head.py 
│ ├── drop_path.py │ ├── layer_scale.py │ ├── mlp.py │ ├── patch_embed.py │ └── swiglu_ffn.py ├── logging │ ├── __init__.py │ └── helpers.py ├── loss │ ├── __init__.py │ ├── dino_clstoken_loss.py │ ├── ibot_patch_loss.py │ └── koleo_loss.py ├── models │ ├── __init__.py │ └── vision_transformer.py ├── run │ ├── __init__.py │ ├── eval │ │ ├── knn.py │ │ ├── linear.py │ │ └── log_regression.py │ ├── submit.py │ └── train │ │ └── train.py ├── train │ ├── __init__.py │ ├── ssl_meta_arch.py │ └── train.py └── utils │ ├── __init__.py │ ├── cluster.py │ ├── config.py │ ├── dtype.py │ ├── param_groups.py │ └── utils.py ├── hubconf.py ├── notebooks ├── depth_estimation.ipynb └── semantic_segmentation.ipynb ├── pyproject.toml ├── requirements-dev.txt ├── requirements-extras.txt ├── requirements.txt ├── scripts └── lint.sh ├── setup.cfg └── setup.py /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run-linters: 13 | name: Run linters 14 | runs-on: ubuntu-20.04 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v3 19 | - name: Set up Python 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: 3.9 23 | cache: 'pip' 24 | cache-dependency-path: '**/requirements*.txt' 25 | - name: Install Python (development) dependencies 26 | run: | 27 | pip install -r requirements-dev.txt 28 | - name: Run flake8 29 | run: | 30 | flake8 31 | - name: Run black 32 | if: always() 33 | run: | 34 | black --check dinov2 35 | - name: Run pylint 36 | if: always() 37 | run: | 38 | pylint --exit-zero dinov2 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | *.egg-info/ 4 | **/__pycache__/ 5 | 6 | **/.ipynb_checkpoints 7 | **/.ipynb_checkpoints/** 8 | 9 | *.swp 10 | 11 | .vscode/ 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DINOv2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to DINOv2, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 
32 | -------------------------------------------------------------------------------- /conda-extras.yaml: -------------------------------------------------------------------------------- 1 | name: dinov2-extras 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | - xformers 7 | - conda-forge 8 | dependencies: 9 | - python=3.9 10 | - pytorch::pytorch=2.0.0 11 | - pytorch::pytorch-cuda=11.7.0 12 | - pytorch::torchvision=0.15.0 13 | - omegaconf 14 | - torchmetrics=0.10.3 15 | - fvcore 16 | - iopath 17 | - xformers::xformers=0.0.18 18 | - pip 19 | - pip: 20 | - git+https://github.com/facebookincubator/submitit 21 | - --extra-index-url https://pypi.nvidia.com 22 | - cuml-cu11 23 | - mmcv-full==1.5.0 24 | - mmsegmentation==0.27.0 25 | -------------------------------------------------------------------------------- /conda.yaml: -------------------------------------------------------------------------------- 1 | name: dinov2 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | - xformers 7 | - conda-forge 8 | dependencies: 9 | - python=3.9 10 | - pytorch::pytorch=2.0.0 11 | - pytorch::pytorch-cuda=11.7.0 12 | - pytorch::torchvision=0.15.0 13 | - omegaconf 14 | - torchmetrics=0.10.3 15 | - fvcore 16 | - iopath 17 | - xformers::xformers=0.0.18 18 | - pip 19 | - pip: 20 | - git+https://github.com/facebookincubator/submitit 21 | - --extra-index-url https://pypi.nvidia.com 22 | - cuml-cu11 23 | -------------------------------------------------------------------------------- /dinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | __version__ = "0.0.1" 7 | -------------------------------------------------------------------------------- /dinov2/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
5 | 6 | import pathlib 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | def load_config(config_name: str): 12 | config_filename = config_name + ".yaml" 13 | return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename) 14 | 15 | 16 | dinov2_default_config = load_config("ssl_default_config") 17 | 18 | 19 | def load_and_merge_config(config_name: str): 20 | default_config = OmegaConf.create(dinov2_default_config) 21 | loaded_config = load_config(config_name) 22 | return OmegaConf.merge(default_config, loaded_config) 23 | -------------------------------------------------------------------------------- /dinov2/configs/eval/vitb14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_base 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitb14_reg4_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_base 3 | patch_size: 14 4 | num_register_tokens: 4 5 | interpolate_antialias: true 6 | interpolate_offset: 0.0 7 | crops: 8 | global_crops_size: 518 # this is to set up the position embeddings properly 9 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitg14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_giant2 3 | patch_size: 14 4 | ffn_layer: swiglufused 5 | crops: 6 | global_crops_size: 518 # this is to set up the position embeddings properly 7 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitg14_reg4_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_giant2 3 | patch_size: 14 4 | ffn_layer: swiglufused 5 | num_register_tokens: 4 6 | interpolate_antialias: true 7 | interpolate_offset: 0.0 8 | crops: 9 | global_crops_size: 518 # this is to set up the position embeddings properly 10 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitl14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_large 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitl14_reg4_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_large 3 | patch_size: 14 4 | num_register_tokens: 4 5 | interpolate_antialias: true 6 | interpolate_offset: 0.0 7 | crops: 8 | global_crops_size: 518 # this is to set up the position embeddings properly 9 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vits14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_small 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 
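These eval configs are deliberately thin overlays: load_and_merge_config (defined in dinov2/configs/__init__.py above) merges each of them on top of ssl_default_config, so a file like vits14_pretrain.yaml only needs to override the architecture, the patch size, and the crop sizes. The global crop size of 518 is chosen because 518 = 37 x 14: it is divisible by the patch size, giving a 37 x 37 token grid that the position embeddings are set up for. A minimal usage sketch (assuming the dinov2 package is importable; the printed values follow from the YAML overlay above and the defaults in ssl_default_config further down):

from dinov2.configs import load_and_merge_config

cfg = load_and_merge_config("eval/vits14_pretrain")
print(cfg.student.arch)             # vit_small, from the eval overlay
print(cfg.student.patch_size)       # 14, overriding the default of 16
print(cfg.crops.global_crops_size)  # 518, i.e. 518 / 14 = 37 patches per side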
-------------------------------------------------------------------------------- /dinov2/configs/eval/vits14_reg4_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_small 3 | patch_size: 14 4 | num_register_tokens: 4 5 | interpolate_antialias: true 6 | interpolate_offset: 0.0 7 | crops: 8 | global_crops_size: 518 # this is to set up the position embeddings properly 9 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/ssl_default_config.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHTS: '' 3 | compute_precision: 4 | grad_scaler: true 5 | teacher: 6 | backbone: 7 | sharding_strategy: SHARD_GRAD_OP 8 | mixed_precision: 9 | param_dtype: fp16 10 | reduce_dtype: fp16 11 | buffer_dtype: fp32 12 | dino_head: 13 | sharding_strategy: SHARD_GRAD_OP 14 | mixed_precision: 15 | param_dtype: fp16 16 | reduce_dtype: fp16 17 | buffer_dtype: fp32 18 | ibot_head: 19 | sharding_strategy: SHARD_GRAD_OP 20 | mixed_precision: 21 | param_dtype: fp16 22 | reduce_dtype: fp16 23 | buffer_dtype: fp32 24 | student: 25 | backbone: 26 | sharding_strategy: SHARD_GRAD_OP 27 | mixed_precision: 28 | param_dtype: fp16 29 | reduce_dtype: fp16 30 | buffer_dtype: fp32 31 | dino_head: 32 | sharding_strategy: SHARD_GRAD_OP 33 | mixed_precision: 34 | param_dtype: fp16 35 | reduce_dtype: fp32 36 | buffer_dtype: fp32 37 | ibot_head: 38 | sharding_strategy: SHARD_GRAD_OP 39 | mixed_precision: 40 | param_dtype: fp16 41 | reduce_dtype: fp32 42 | buffer_dtype: fp32 43 | dino: 44 | loss_weight: 1.0 45 | head_n_prototypes: 65536 46 | head_bottleneck_dim: 256 47 | head_nlayers: 3 48 | head_hidden_dim: 2048 49 | koleo_loss_weight: 0.1 50 | ibot: 51 | loss_weight: 1.0 52 | mask_sample_probability: 0.5 53 | mask_ratio_min_max: 54 | - 0.1 55 | - 0.5 56 | separate_head: false 57 | head_n_prototypes: 65536 58 | head_bottleneck_dim: 256 59 | head_nlayers: 3 60 | head_hidden_dim: 2048 61 | train: 62 | batch_size_per_gpu: 64 63 | dataset_path: ImageNet:split=TRAIN 64 | output_dir: . 65 | saveckp_freq: 20 66 | seed: 0 67 | num_workers: 10 68 | OFFICIAL_EPOCH_LENGTH: 1250 69 | cache_dataset: true 70 | centering: "centering" # or "sinkhorn_knopp" 71 | student: 72 | arch: vit_large 73 | patch_size: 16 74 | drop_path_rate: 0.3 75 | layerscale: 1.0e-05 76 | drop_path_uniform: true 77 | pretrained_weights: '' 78 | ffn_layer: "mlp" 79 | block_chunks: 0 80 | qkv_bias: true 81 | proj_bias: true 82 | ffn_bias: true 83 | num_register_tokens: 0 84 | interpolate_antialias: false 85 | interpolate_offset: 0.1 86 | teacher: 87 | momentum_teacher: 0.992 88 | final_momentum_teacher: 1 89 | warmup_teacher_temp: 0.04 90 | teacher_temp: 0.07 91 | warmup_teacher_temp_epochs: 30 92 | optim: 93 | epochs: 100 94 | weight_decay: 0.04 95 | weight_decay_end: 0.4 96 | base_lr: 0.004 # learning rate for a batch size of 1024 97 | lr: 0. 
# will be set after applying scaling rule 98 | warmup_epochs: 10 99 | min_lr: 1.0e-06 100 | clip_grad: 3.0 101 | freeze_last_layer_epochs: 1 102 | scaling_rule: sqrt_wrt_1024 103 | patch_embed_lr_mult: 0.2 104 | layerwise_decay: 0.9 105 | adamw_beta1: 0.9 106 | adamw_beta2: 0.999 107 | crops: 108 | global_crops_scale: 109 | - 0.32 110 | - 1.0 111 | local_crops_number: 8 112 | local_crops_scale: 113 | - 0.05 114 | - 0.32 115 | global_crops_size: 224 116 | local_crops_size: 96 117 | evaluation: 118 | eval_period_iterations: 12500 119 | -------------------------------------------------------------------------------- /dinov2/configs/train/vitg14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 12 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_giant2 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/train/vitl14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 32 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_large 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/train/vitl16_short.yaml: -------------------------------------------------------------------------------- 1 | # this corresponds to the default config 2 | train: 3 | dataset_path: ImageNet:split=TRAIN 4 | batch_size_per_gpu: 64 5 | student: 6 | block_chunks: 4 7 | -------------------------------------------------------------------------------- /dinov2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .adapters import DatasetWithEnumeratedTargets 7 | from .loaders import make_data_loader, make_dataset, SamplerType 8 | from .collate import collate_data_and_cast 9 | from .masking import MaskingGenerator 10 | from .augmentations import DataAugmentationDINO 11 | -------------------------------------------------------------------------------- /dinov2/data/adapters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from typing import Any, Tuple 7 | 8 | from torch.utils.data import Dataset 9 | 10 | 11 | class DatasetWithEnumeratedTargets(Dataset): 12 | def __init__(self, dataset): 13 | self._dataset = dataset 14 | 15 | def get_image_data(self, index: int) -> bytes: 16 | return self._dataset.get_image_data(index) 17 | 18 | def get_target(self, index: int) -> Tuple[Any, int]: 19 | target = self._dataset.get_target(index) 20 | return (index, target) 21 | 22 | def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]: 23 | image, target = self._dataset[index] 24 | target = index if target is None else target 25 | return image, (index, target) 26 | 27 | def __len__(self) -> int: 28 | return len(self._dataset) 29 | -------------------------------------------------------------------------------- /dinov2/data/augmentations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | 8 | from torchvision import transforms 9 | 10 | from .transforms import ( 11 | GaussianBlur, 12 | make_normalize_transform, 13 | ) 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class DataAugmentationDINO(object): 20 | def __init__( 21 | self, 22 | global_crops_scale, 23 | local_crops_scale, 24 | local_crops_number, 25 | global_crops_size=224, 26 | local_crops_size=96, 27 | ): 28 | self.global_crops_scale = global_crops_scale 29 | self.local_crops_scale = local_crops_scale 30 | self.local_crops_number = local_crops_number 31 | self.global_crops_size = global_crops_size 32 | self.local_crops_size = local_crops_size 33 | 34 | logger.info("###################################") 35 | logger.info("Using data augmentation parameters:") 36 | logger.info(f"global_crops_scale: {global_crops_scale}") 37 | logger.info(f"local_crops_scale: {local_crops_scale}") 38 | logger.info(f"local_crops_number: {local_crops_number}") 39 | logger.info(f"global_crops_size: {global_crops_size}") 40 | logger.info(f"local_crops_size: {local_crops_size}") 41 | logger.info("###################################") 42 | 43 | # random resized crop and flip 44 | self.geometric_augmentation_global = transforms.Compose( 45 | [ 46 | transforms.RandomResizedCrop( 47 | global_crops_size, scale=global_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC 48 | ), 49 | transforms.RandomHorizontalFlip(p=0.5), 50 | ] 51 | ) 52 | 53 | self.geometric_augmentation_local = transforms.Compose( 54 | [ 55 | transforms.RandomResizedCrop( 56 | local_crops_size, scale=local_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC 57 | ), 58 | transforms.RandomHorizontalFlip(p=0.5), 59 | ] 60 | ) 61 | 62 | # color distortions / blurring 63 | color_jittering = transforms.Compose( 64 | [ 65 | transforms.RandomApply( 66 | [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)], 67 | p=0.8, 68 | ), 69 | transforms.RandomGrayscale(p=0.2), 70 | ] 71 | ) 72 | 73 | global_transfo1_extra = GaussianBlur(p=1.0) 74 | 75 | global_transfo2_extra = transforms.Compose( 76 | [ 77 | GaussianBlur(p=0.1), 78 | transforms.RandomSolarize(threshold=128, p=0.2), 79 | ] 80 | ) 81 | 82 | local_transfo_extra = GaussianBlur(p=0.5) 83
| 84 | # normalization 85 | self.normalize = transforms.Compose( 86 | [ 87 | transforms.ToTensor(), 88 | make_normalize_transform(), 89 | ] 90 | ) 91 | 92 | self.global_transfo1 = transforms.Compose([color_jittering, global_transfo1_extra, self.normalize]) 93 | self.global_transfo2 = transforms.Compose([color_jittering, global_transfo2_extra, self.normalize]) 94 | self.local_transfo = transforms.Compose([color_jittering, local_transfo_extra, self.normalize]) 95 | 96 | def __call__(self, image): 97 | output = {} 98 | 99 | # global crops: 100 | im1_base = self.geometric_augmentation_global(image) 101 | global_crop_1 = self.global_transfo1(im1_base) 102 | 103 | im2_base = self.geometric_augmentation_global(image) 104 | global_crop_2 = self.global_transfo2(im2_base) 105 | 106 | output["global_crops"] = [global_crop_1, global_crop_2] 107 | 108 | # global crops for teacher: 109 | output["global_crops_teacher"] = [global_crop_1, global_crop_2] 110 | 111 | # local crops: 112 | local_crops = [ 113 | self.local_transfo(self.geometric_augmentation_local(image)) for _ in range(self.local_crops_number) 114 | ] 115 | output["local_crops"] = local_crops 116 | output["offsets"] = () 117 | 118 | return output 119 | -------------------------------------------------------------------------------- /dinov2/data/collate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import random 8 | 9 | 10 | def collate_data_and_cast(samples_list, mask_ratio_tuple, mask_probability, dtype, n_tokens=None, mask_generator=None): 11 | # dtype = torch.half # TODO: Remove 12 | 13 | n_global_crops = len(samples_list[0][0]["global_crops"]) 14 | n_local_crops = len(samples_list[0][0]["local_crops"]) 15 | 16 | collated_global_crops = torch.stack([s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list]) 17 | 18 | collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list]) 19 | 20 | B = len(collated_global_crops) 21 | N = n_tokens 22 | n_samples_masked = int(B * mask_probability) 23 | probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1) 24 | upperbound = 0 25 | masks_list = [] 26 | for i in range(0, n_samples_masked): 27 | prob_min = probs[i] 28 | prob_max = probs[i + 1] 29 | masks_list.append(torch.BoolTensor(mask_generator(int(N * random.uniform(prob_min, prob_max))))) 30 | upperbound += int(N * prob_max) 31 | for i in range(n_samples_masked, B): 32 | masks_list.append(torch.BoolTensor(mask_generator(0))) 33 | 34 | random.shuffle(masks_list) 35 | 36 | collated_masks = torch.stack(masks_list).flatten(1) 37 | mask_indices_list = collated_masks.flatten().nonzero().flatten() 38 | 39 | masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks] 40 | 41 | return { 42 | "collated_global_crops": collated_global_crops.to(dtype), 43 | "collated_local_crops": collated_local_crops.to(dtype), 44 | "collated_masks": collated_masks, 45 | "mask_indices_list": mask_indices_list, 46 | "masks_weight": masks_weight, 47 | "upperbound": upperbound, 48 | "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long), 49 | } 50 | -------------------------------------------------------------------------------- 
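Taken together, the pieces exported from dinov2/data/__init__.py form the iBOT-style data pipeline: DataAugmentationDINO (above) maps one image to a dict of global and local crops, MaskingGenerator (below, in dinov2/data/masking.py) samples block-shaped patch masks, and collate_data_and_cast stacks the crops, draws a per-sample mask ratio from mask_ratio_tuple for roughly mask_probability of the global crops, and casts to the training dtype. A minimal end-to-end sketch, not the repo's train loop; the 224-pixel global crop, 16-pixel patch, and mask parameters mirror ssl_default_config, while the blank PIL image and batch of 4 are purely illustrative:

from functools import partial

import torch
from PIL import Image

from dinov2.data import DataAugmentationDINO, MaskingGenerator, collate_data_and_cast

crop_size, patch_size = 224, 16
n_tokens = (crop_size // patch_size) ** 2  # 196 patch tokens per global crop

augment = DataAugmentationDINO(
    global_crops_scale=(0.32, 1.0),
    local_crops_scale=(0.05, 0.32),
    local_crops_number=8,
    global_crops_size=crop_size,
)
mask_generator = MaskingGenerator(
    input_size=(crop_size // patch_size, crop_size // patch_size),
    max_num_patches=0.5 * n_tokens,  # cap any single mask at half the tokens
)
collate_fn = partial(
    collate_data_and_cast,
    mask_ratio_tuple=(0.1, 0.5),
    mask_probability=0.5,
    dtype=torch.half,
    n_tokens=n_tokens,
    mask_generator=mask_generator,
)

image = Image.new("RGB", (256, 256))
# collate_data_and_cast only reads s[0], so samples are (crops_dict, target) pairs
batch = collate_fn([(augment(image), 0) for _ in range(4)])
print(batch["collated_global_crops"].shape)  # torch.Size([8, 3, 224, 224])
print(batch["collated_masks"].shape)         # torch.Size([8, 196])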
/dinov2/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .image_net import ImageNet 7 | from .image_net_22k import ImageNet22k 8 | -------------------------------------------------------------------------------- /dinov2/data/datasets/decoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from io import BytesIO 7 | from typing import Any 8 | 9 | from PIL import Image 10 | 11 | 12 | class Decoder: 13 | def decode(self) -> Any: 14 | raise NotImplementedError 15 | 16 | 17 | class ImageDataDecoder(Decoder): 18 | def __init__(self, image_data: bytes) -> None: 19 | self._image_data = image_data 20 | 21 | def decode(self) -> Image.Image: 22 | f = BytesIO(self._image_data) 23 | return Image.open(f).convert(mode="RGB") 24 | 25 | 26 | class TargetDecoder(Decoder): 27 | def __init__(self, target: Any): 28 | self._target = target 29 | 30 | def decode(self) -> Any: 31 | return self._target 32 | -------------------------------------------------------------------------------- /dinov2/data/datasets/extended.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from typing import Any, Tuple 7 | 8 | from torchvision.datasets import VisionDataset 9 | 10 | from .decoders import TargetDecoder, ImageDataDecoder 11 | 12 | 13 | class ExtendedVisionDataset(VisionDataset): 14 | def __init__(self, *args, **kwargs) -> None: 15 | super().__init__(*args, **kwargs)  # type: ignore 16 | 17 | def get_image_data(self, index: int) -> bytes: 18 | raise NotImplementedError 19 | 20 | def get_target(self, index: int) -> Any: 21 | raise NotImplementedError 22 | 23 | def __getitem__(self, index: int) -> Tuple[Any, Any]: 24 | try: 25 | image_data = self.get_image_data(index) 26 | image = ImageDataDecoder(image_data).decode() 27 | except Exception as e: 28 | raise RuntimeError(f"cannot read image for sample {index}") from e 29 | target = self.get_target(index) 30 | target = TargetDecoder(target).decode() 31 | 32 | if self.transforms is not None: 33 | image, target = self.transforms(image, target) 34 | 35 | return image, target 36 | 37 | def __len__(self) -> int: 38 | raise NotImplementedError 39 | -------------------------------------------------------------------------------- /dinov2/data/masking.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
5 | 6 | import random 7 | import math 8 | import numpy as np 9 | 10 | 11 | class MaskingGenerator: 12 | def __init__( 13 | self, 14 | input_size, 15 | num_masking_patches=None, 16 | min_num_patches=4, 17 | max_num_patches=None, 18 | min_aspect=0.3, 19 | max_aspect=None, 20 | ): 21 | if not isinstance(input_size, tuple): 22 | input_size = (input_size,) * 2 23 | self.height, self.width = input_size 24 | 25 | self.num_patches = self.height * self.width 26 | self.num_masking_patches = num_masking_patches 27 | 28 | self.min_num_patches = min_num_patches 29 | self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches 30 | 31 | max_aspect = max_aspect or 1 / min_aspect 32 | self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) 33 | 34 | def __repr__(self): 35 | repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( 36 | self.height, 37 | self.width, 38 | self.min_num_patches, 39 | self.max_num_patches, 40 | self.num_masking_patches, 41 | self.log_aspect_ratio[0], 42 | self.log_aspect_ratio[1], 43 | ) 44 | return repr_str 45 | 46 | def get_shape(self): 47 | return self.height, self.width 48 | 49 | def _mask(self, mask, max_mask_patches): 50 | delta = 0 51 | for _ in range(10): 52 | target_area = random.uniform(self.min_num_patches, max_mask_patches) 53 | aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) 54 | h = int(round(math.sqrt(target_area * aspect_ratio))) 55 | w = int(round(math.sqrt(target_area / aspect_ratio))) 56 | if w < self.width and h < self.height: 57 | top = random.randint(0, self.height - h) 58 | left = random.randint(0, self.width - w) 59 | 60 | num_masked = mask[top : top + h, left : left + w].sum() 61 | # Overlap 62 | if 0 < h * w - num_masked <= max_mask_patches: 63 | for i in range(top, top + h): 64 | for j in range(left, left + w): 65 | if mask[i, j] == 0: 66 | mask[i, j] = 1 67 | delta += 1 68 | 69 | if delta > 0: 70 | break 71 | return delta 72 | 73 | def __call__(self, num_masking_patches=0): 74 | mask = np.zeros(shape=self.get_shape(), dtype=bool) 75 | mask_count = 0 76 | while mask_count < num_masking_patches: 77 | max_mask_patches = num_masking_patches - mask_count 78 | max_mask_patches = min(max_mask_patches, self.max_num_patches) 79 | 80 | delta = self._mask(mask, max_mask_patches) 81 | if delta == 0: 82 | break 83 | else: 84 | mask_count += delta 85 | 86 | return mask 87 | -------------------------------------------------------------------------------- /dinov2/data/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from typing import Sequence 7 | 8 | import torch 9 | from torchvision import transforms 10 | 11 | 12 | class GaussianBlur(transforms.RandomApply): 13 | """ 14 | Apply Gaussian Blur to the PIL image. 15 | """ 16 | 17 | def __init__(self, *, p: float = 0.5, radius_min: float = 0.1, radius_max: float = 2.0): 18 | # NOTE: torchvision's RandomApply applies the transforms with probability p 19 | # (and returns the original image with probability 1 - p), so p is passed through as-is 20 | transform = transforms.GaussianBlur(kernel_size=9, sigma=(radius_min, radius_max)) 21 | super().__init__(transforms=[transform], p=p) 22 | 23 | 24 | class MaybeToTensor(transforms.ToTensor): 25 | """ 26 | Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor, or keep as is if already a tensor.
27 | """ 28 | 29 | def __call__(self, pic): 30 | """ 31 | Args: 32 | pic (PIL Image, numpy.ndarray or torch.tensor): Image to be converted to tensor. 33 | Returns: 34 | Tensor: Converted image. 35 | """ 36 | if isinstance(pic, torch.Tensor): 37 | return pic 38 | return super().__call__(pic) 39 | 40 | 41 | # Use timm's names 42 | IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) 43 | IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) 44 | 45 | 46 | def make_normalize_transform( 47 | mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, 48 | std: Sequence[float] = IMAGENET_DEFAULT_STD, 49 | ) -> transforms.Normalize: 50 | return transforms.Normalize(mean=mean, std=std) 51 | 52 | 53 | # This roughly matches torchvision's preset for classification training: 54 | # https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L6-L44 55 | def make_classification_train_transform( 56 | *, 57 | crop_size: int = 224, 58 | interpolation=transforms.InterpolationMode.BICUBIC, 59 | hflip_prob: float = 0.5, 60 | mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, 61 | std: Sequence[float] = IMAGENET_DEFAULT_STD, 62 | ): 63 | transforms_list = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] 64 | if hflip_prob > 0.0: 65 | transforms_list.append(transforms.RandomHorizontalFlip(hflip_prob)) 66 | transforms_list.extend( 67 | [ 68 | MaybeToTensor(), 69 | make_normalize_transform(mean=mean, std=std), 70 | ] 71 | ) 72 | return transforms.Compose(transforms_list) 73 | 74 | 75 | # This matches (roughly) torchvision's preset for classification evaluation: 76 | # https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L47-L69 77 | def make_classification_eval_transform( 78 | *, 79 | resize_size: int = 256, 80 | interpolation=transforms.InterpolationMode.BICUBIC, 81 | crop_size: int = 224, 82 | mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, 83 | std: Sequence[float] = IMAGENET_DEFAULT_STD, 84 | ) -> transforms.Compose: 85 | transforms_list = [ 86 | transforms.Resize(resize_size, interpolation=interpolation), 87 | transforms.CenterCrop(crop_size), 88 | MaybeToTensor(), 89 | make_normalize_transform(mean=mean, std=std), 90 | ] 91 | return transforms.Compose(transforms_list) 92 | -------------------------------------------------------------------------------- /dinov2/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/eval/depth/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
5 | 6 | from .backbones import *  # noqa: F403 7 | from .builder import BACKBONES, DEPTHER, HEADS, LOSSES, build_backbone, build_depther, build_head, build_loss 8 | from .decode_heads import *  # noqa: F403 9 | from .depther import *  # noqa: F403 10 | from .losses import *  # noqa: F403 11 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .vision_transformer import DinoVisionTransformer 7 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/backbones/vision_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from mmcv.runner import BaseModule 7 | 8 | from ..builder import BACKBONES 9 | 10 | 11 | @BACKBONES.register_module() 12 | class DinoVisionTransformer(BaseModule): 13 | """Vision Transformer.""" 14 | 15 | def __init__(self, *args, **kwargs): 16 | super().__init__() 17 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import warnings 7 | 8 | from mmcv.cnn import MODELS as MMCV_MODELS 9 | from mmcv.cnn.bricks.registry import ATTENTION as MMCV_ATTENTION 10 | from mmcv.utils import Registry 11 | 12 | MODELS = Registry("models", parent=MMCV_MODELS) 13 | ATTENTION = Registry("attention", parent=MMCV_ATTENTION) 14 | 15 | 16 | BACKBONES = MODELS 17 | NECKS = MODELS 18 | HEADS = MODELS 19 | LOSSES = MODELS 20 | DEPTHER = MODELS 21 | 22 | 23 | def build_backbone(cfg): 24 | """Build backbone.""" 25 | return BACKBONES.build(cfg) 26 | 27 | 28 | def build_neck(cfg): 29 | """Build neck.""" 30 | return NECKS.build(cfg) 31 | 32 | 33 | def build_head(cfg): 34 | """Build head.""" 35 | return HEADS.build(cfg) 36 | 37 | 38 | def build_loss(cfg): 39 | """Build loss.""" 40 | return LOSSES.build(cfg) 41 | 42 | 43 | def build_depther(cfg, train_cfg=None, test_cfg=None): 44 | """Build depther.""" 45 | if train_cfg is not None or test_cfg is not None: 46 | warnings.warn("train_cfg and test_cfg are deprecated, please specify them in model", UserWarning) 47 | assert cfg.get("train_cfg") is None or train_cfg is None, "train_cfg specified in both outer field and model field" 48 | assert cfg.get("test_cfg") is None or test_cfg is None, "test_cfg specified in both outer field and model field" 49 | return DEPTHER.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) 50 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .dpt_head import DPTHead 7 | from .linear_head import BNHead 8 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/decode_heads/linear_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from ...ops import resize 10 | from ..builder import HEADS 11 | from .decode_head import DepthBaseDecodeHead 12 | 13 | 14 | @HEADS.register_module() 15 | class BNHead(DepthBaseDecodeHead): 16 | """Just a batchnorm.""" 17 | 18 | def __init__(self, input_transform="resize_concat", in_index=(0, 1, 2, 3), upsample=1, **kwargs): 19 | super().__init__(**kwargs) 20 | self.input_transform = input_transform 21 | self.in_index = in_index 22 | self.upsample = upsample 23 | # self.bn = nn.SyncBatchNorm(self.in_channels) 24 | if self.classify: 25 | self.conv_depth = nn.Conv2d(self.channels, self.n_bins, kernel_size=1, padding=0, stride=1) 26 | else: 27 | self.conv_depth = nn.Conv2d(self.channels, 1, kernel_size=1, padding=0, stride=1) 28 | 29 | def _transform_inputs(self, inputs): 30 | """Transform inputs for decoder. 31 | Args: 32 | inputs (list[Tensor]): List of multi-level img features. 33 | Returns: 34 | Tensor: The transformed inputs 35 | """ 36 | 37 | if "concat" in self.input_transform: 38 | inputs = [inputs[i] for i in self.in_index] 39 | if "resize" in self.input_transform: 40 | inputs = [ 41 | resize( 42 | input=x, 43 | size=[s * self.upsample for s in inputs[0].shape[2:]], 44 | mode="bilinear", 45 | align_corners=self.align_corners, 46 | ) 47 | for x in inputs 48 | ] 49 | inputs = torch.cat(inputs, dim=1) 50 | elif self.input_transform == "multiple_select": 51 | inputs = [inputs[i] for i in self.in_index] 52 | else: 53 | inputs = inputs[self.in_index] 54 | 55 | return inputs 56 | 57 | def _forward_feature(self, inputs, img_metas=None, **kwargs): 58 | """Forward function for feature maps before classifying each pixel with 59 | ``self.cls_seg`` fc. 60 | Args: 61 | inputs (list[Tensor]): List of multi-level img features. 62 | Returns: 63 | feats (Tensor): A tensor of shape (batch_size, self.channels, 64 | H, W) which is feature map for last layer of decoder head. 
65 | """ 66 | # accept lists (for cls token) 67 | inputs = list(inputs) 68 | for i, x in enumerate(inputs): 69 | if len(x) == 2: 70 | x, cls_token = x[0], x[1] 71 | if len(x.shape) == 2: 72 | x = x[:, :, None, None] 73 | cls_token = cls_token[:, :, None, None].expand_as(x) 74 | inputs[i] = torch.cat((x, cls_token), 1) 75 | else: 76 | x = x[0] 77 | if len(x.shape) == 2: 78 | x = x[:, :, None, None] 79 | inputs[i] = x 80 | x = self._transform_inputs(inputs) 81 | # feats = self.bn(x) 82 | return x 83 | 84 | def forward(self, inputs, img_metas=None, **kwargs): 85 | """Forward function.""" 86 | output = self._forward_feature(inputs, img_metas=img_metas, **kwargs) 87 | output = self.depth_pred(output) 88 | 89 | return output 90 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/depther/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .base import BaseDepther 7 | from .encoder_decoder import DepthEncoderDecoder 8 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .gradientloss import GradientLoss 7 | from .sigloss import SigLoss 8 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/losses/gradientloss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from ...models.builder import LOSSES 10 | 11 | 12 | @LOSSES.register_module() 13 | class GradientLoss(nn.Module): 14 | """GradientLoss. 15 | 16 | Adapted from https://www.cs.cornell.edu/projects/megadepth/ 17 | 18 | Args: 19 | valid_mask (bool): Whether filter invalid gt (gt > 0). Default: True. 20 | loss_weight (float): Weight of the loss. Default: 1.0. 21 | max_depth (int): When filtering invalid gt, set a max threshold. Default: None. 
22 | """ 23 | 24 | def __init__(self, valid_mask=True, loss_weight=1.0, max_depth=None, loss_name="loss_grad"): 25 | super(GradientLoss, self).__init__() 26 | self.valid_mask = valid_mask 27 | self.loss_weight = loss_weight 28 | self.max_depth = max_depth 29 | self.loss_name = loss_name 30 | 31 | self.eps = 0.001 # avoid grad explode 32 | 33 | def gradientloss(self, input, target): 34 | input_downscaled = [input] + [input[:: 2 * i, :: 2 * i] for i in range(1, 4)] 35 | target_downscaled = [target] + [target[:: 2 * i, :: 2 * i] for i in range(1, 4)] 36 | 37 | gradient_loss = 0 38 | for input, target in zip(input_downscaled, target_downscaled): 39 | if self.valid_mask: 40 | mask = target > 0 41 | if self.max_depth is not None: 42 | mask = torch.logical_and(target > 0, target <= self.max_depth) 43 | N = torch.sum(mask) 44 | else: 45 | mask = torch.ones_like(target) 46 | N = input.numel() 47 | input_log = torch.log(input + self.eps) 48 | target_log = torch.log(target + self.eps) 49 | log_d_diff = input_log - target_log 50 | 51 | log_d_diff = torch.mul(log_d_diff, mask) 52 | 53 | v_gradient = torch.abs(log_d_diff[0:-2, :] - log_d_diff[2:, :]) 54 | v_mask = torch.mul(mask[0:-2, :], mask[2:, :]) 55 | v_gradient = torch.mul(v_gradient, v_mask) 56 | 57 | h_gradient = torch.abs(log_d_diff[:, 0:-2] - log_d_diff[:, 2:]) 58 | h_mask = torch.mul(mask[:, 0:-2], mask[:, 2:]) 59 | h_gradient = torch.mul(h_gradient, h_mask) 60 | 61 | gradient_loss += (torch.sum(h_gradient) + torch.sum(v_gradient)) / N 62 | 63 | return gradient_loss 64 | 65 | def forward(self, depth_pred, depth_gt): 66 | """Forward function.""" 67 | 68 | gradient_loss = self.loss_weight * self.gradientloss(depth_pred, depth_gt) 69 | return gradient_loss 70 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/losses/sigloss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from ...models.builder import LOSSES 10 | 11 | 12 | @LOSSES.register_module() 13 | class SigLoss(nn.Module): 14 | """SigLoss. 15 | 16 | This follows `AdaBins `_. 17 | 18 | Args: 19 | valid_mask (bool): Whether filter invalid gt (gt > 0). Default: True. 20 | loss_weight (float): Weight of the loss. Default: 1.0. 21 | max_depth (int): When filtering invalid gt, set a max threshold. Default: None. 22 | warm_up (bool): A simple warm up stage to help convergence. Default: False. 23 | warm_iter (int): The number of warm up stage. Default: 100. 
24 | """ 25 | 26 | def __init__( 27 | self, valid_mask=True, loss_weight=1.0, max_depth=None, warm_up=False, warm_iter=100, loss_name="sigloss" 28 | ): 29 | super(SigLoss, self).__init__() 30 | self.valid_mask = valid_mask 31 | self.loss_weight = loss_weight 32 | self.max_depth = max_depth 33 | self.loss_name = loss_name 34 | 35 | self.eps = 0.001  # avoid grad explode 36 | 37 | # HACK: a hack implementation for warmup sigloss 38 | self.warm_up = warm_up 39 | self.warm_iter = warm_iter 40 | self.warm_up_counter = 0 41 | 42 | def sigloss(self, input, target): 43 | if self.valid_mask: 44 | valid_mask = target > 0 45 | if self.max_depth is not None: 46 | valid_mask = torch.logical_and(target > 0, target <= self.max_depth) 47 | input = input[valid_mask] 48 | target = target[valid_mask] 49 | 50 | if self.warm_up: 51 | if self.warm_up_counter < self.warm_iter: 52 | g = torch.log(input + self.eps) - torch.log(target + self.eps) 53 | g = 0.15 * torch.pow(torch.mean(g), 2) 54 | self.warm_up_counter += 1 55 | return torch.sqrt(g) 56 | 57 | g = torch.log(input + self.eps) - torch.log(target + self.eps) 58 | Dg = torch.var(g) + 0.15 * torch.pow(torch.mean(g), 2) 59 | return torch.sqrt(Dg) 60 | 61 | def forward(self, depth_pred, depth_gt): 62 | """Forward function.""" 63 | 64 | loss_depth = self.loss_weight * self.sigloss(depth_pred, depth_gt) 65 | return loss_depth 66 | -------------------------------------------------------------------------------- /dinov2/eval/depth/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .wrappers import resize 7 | -------------------------------------------------------------------------------- /dinov2/eval/depth/ops/wrappers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import warnings 7 | 8 | import torch.nn.functional as F 9 | 10 | 11 | def resize(input, size=None, scale_factor=None, mode="nearest", align_corners=None, warning=False): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > input_w: 17 | if ( 18 | (output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1) 19 | and (output_h - 1) % (input_h - 1) 20 | and (output_w - 1) % (input_w - 1) 21 | ): 22 | warnings.warn( 23 | f"When align_corners={align_corners}, " 24 | "the output would be more aligned if " 25 | f"input size {(input_h, input_w)} is `x+1` and " 26 | f"out size {(output_h, output_w)} is `nx+1`" 27 | ) 28 | return F.interpolate(input, size, scale_factor, mode, align_corners) 29 | -------------------------------------------------------------------------------- /dinov2/eval/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
5 | 6 | from enum import Enum 7 | import logging 8 | from typing import Any, Dict, Optional 9 | 10 | import torch 11 | from torch import Tensor 12 | from torchmetrics import Metric, MetricCollection 13 | from torchmetrics.classification import MulticlassAccuracy 14 | from torchmetrics.utilities.data import dim_zero_cat, select_topk 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | class MetricType(Enum): 21 | MEAN_ACCURACY = "mean_accuracy" 22 | MEAN_PER_CLASS_ACCURACY = "mean_per_class_accuracy" 23 | PER_CLASS_ACCURACY = "per_class_accuracy" 24 | IMAGENET_REAL_ACCURACY = "imagenet_real_accuracy" 25 | 26 | @property 27 | def accuracy_averaging(self): 28 | return getattr(AccuracyAveraging, self.name, None) 29 | 30 | def __str__(self): 31 | return self.value 32 | 33 | 34 | class AccuracyAveraging(Enum): 35 | MEAN_ACCURACY = "micro" 36 | MEAN_PER_CLASS_ACCURACY = "macro" 37 | PER_CLASS_ACCURACY = "none" 38 | 39 | def __str__(self): 40 | return self.value 41 | 42 | 43 | def build_metric(metric_type: MetricType, *, num_classes: int, ks: Optional[tuple] = None): 44 | if metric_type.accuracy_averaging is not None: 45 | return build_topk_accuracy_metric( 46 | average_type=metric_type.accuracy_averaging, 47 | num_classes=num_classes, 48 | ks=(1, 5) if ks is None else ks, 49 | ) 50 | elif metric_type == MetricType.IMAGENET_REAL_ACCURACY: 51 | return build_topk_imagenet_real_accuracy_metric( 52 | num_classes=num_classes, 53 | ks=(1, 5) if ks is None else ks, 54 | ) 55 | 56 | raise ValueError(f"Unknown metric type {metric_type}") 57 | 58 | 59 | def build_topk_accuracy_metric(average_type: AccuracyAveraging, num_classes: int, ks: tuple = (1, 5)): 60 | metrics: Dict[str, Metric] = { 61 | f"top-{k}": MulticlassAccuracy(top_k=k, num_classes=int(num_classes), average=average_type.value) for k in ks 62 | } 63 | return MetricCollection(metrics) 64 | 65 | 66 | def build_topk_imagenet_real_accuracy_metric(num_classes: int, ks: tuple = (1, 5)): 67 | metrics: Dict[str, Metric] = {f"top-{k}": ImageNetReaLAccuracy(top_k=k, num_classes=int(num_classes)) for k in ks} 68 | return MetricCollection(metrics) 69 | 70 | 71 | class ImageNetReaLAccuracy(Metric): 72 | is_differentiable: bool = False 73 | higher_is_better: Optional[bool] = None 74 | full_state_update: bool = False 75 | 76 | def __init__( 77 | self, 78 | num_classes: int, 79 | top_k: int = 1, 80 | **kwargs: Any, 81 | ) -> None: 82 | super().__init__(**kwargs) 83 | self.num_classes = num_classes 84 | self.top_k = top_k 85 | self.add_state("tp", [], dist_reduce_fx="cat") 86 | 87 | def update(self, preds: Tensor, target: Tensor) -> None: # type: ignore 88 | # preds [B, D] 89 | # target [B, A] 90 | # preds_oh [B, D] with 0 and 1 91 | # select top K highest probabilities, use one hot representation 92 | preds_oh = select_topk(preds, self.top_k) 93 | # target_oh [B, D + 1] with 0 and 1 94 | target_oh = torch.zeros((preds_oh.shape[0], preds_oh.shape[1] + 1), device=target.device, dtype=torch.int32) 95 | target = target.long() 96 | # for undefined targets (-1) use a fake value `num_classes` 97 | target[target == -1] = self.num_classes 98 | # fill targets, use one hot representation 99 | target_oh.scatter_(1, target, 1) 100 | # target_oh [B, D] (remove the fake target at index `num_classes`) 101 | target_oh = target_oh[:, :-1] 102 | # tp [B] with 0 and 1 103 | tp = (preds_oh * target_oh == 1).sum(dim=1) 104 | # at least one match between prediction and target 105 | tp.clip_(max=1) 106 | # ignore instances where no targets are defined 107 | mask = 
target_oh.sum(dim=1) > 0 108 | tp = tp[mask] 109 | self.tp.append(tp) # type: ignore 110 | 111 | def compute(self) -> Tensor: 112 | tp = dim_zero_cat(self.tp) # type: ignore 113 | return tp.float().mean() 114 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .optimizer import DistOptimizerHook 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/hooks/optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | try: 7 | import apex 8 | except ImportError: 9 | print("apex is not installed") 10 | 11 | from mmcv.runner import OptimizerHook, HOOKS 12 | 13 | 14 | @HOOKS.register_module() 15 | class DistOptimizerHook(OptimizerHook): 16 | """Optimizer hook for distributed training.""" 17 | 18 | def __init__(self, update_interval=1, grad_clip=None, coalesce=True, bucket_size_mb=-1, use_fp16=False): 19 | self.grad_clip = grad_clip 20 | self.coalesce = coalesce 21 | self.bucket_size_mb = bucket_size_mb 22 | self.update_interval = update_interval 23 | self.use_fp16 = use_fp16 24 | 25 | def before_run(self, runner): 26 | runner.optimizer.zero_grad() 27 | 28 | def after_train_iter(self, runner): 29 | runner.outputs["loss"] /= self.update_interval 30 | if self.use_fp16: 31 | # runner.outputs['loss'].backward() 32 | with apex.amp.scale_loss(runner.outputs["loss"], runner.optimizer) as scaled_loss: 33 | scaled_loss.backward() 34 | else: 35 | runner.outputs["loss"].backward() 36 | if self.every_n_iters(runner, self.update_interval): 37 | if self.grad_clip is not None: 38 | self.clip_grads(runner.model.parameters()) 39 | runner.optimizer.step() 40 | runner.optimizer.zero_grad() 41 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .backbones import * # noqa: F403 7 | from .decode_heads import * # noqa: F403 8 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
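# Hedged config sketch for the DistOptimizerHook above; the key names follow the
# usual mmcv runner conventions and the values are hypothetical. With
# update_interval=2 the hook backpropagates every iteration but steps the
# optimizer and zeroes the gradients only every second iteration, i.e. it
# performs gradient accumulation; use_fp16=True additionally routes the
# backward pass through apex AMP loss scaling.
optimizer_config = dict(
    type="DistOptimizerHook",
    update_interval=2,  # accumulate gradients over 2 iterations
    grad_clip=dict(max_norm=35, norm_type=2),  # forwarded to clip_grads
    use_fp16=False,  # True requires apex
)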
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .vision_transformer import DinoVisionTransformer 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/backbones/vision_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from mmcv.runner import BaseModule 7 | from mmseg.models.builder import BACKBONES 8 | 9 | 10 | @BACKBONES.register_module() 11 | class DinoVisionTransformer(BaseModule): 12 | """Vision Transformer.""" 13 | 14 | def __init__( 15 | self, 16 | *args, 17 | **kwargs, 18 | ): 19 | super().__init__() 20 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .linear_head import BNHead 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/decode_heads/linear_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from mmseg.models.builder import HEADS 10 | from mmseg.models.decode_heads.decode_head import BaseDecodeHead 11 | from mmseg.ops import resize 12 | 13 | 14 | @HEADS.register_module() 15 | class BNHead(BaseDecodeHead): 16 | """Just a batchnorm.""" 17 | 18 | def __init__(self, resize_factors=None, **kwargs): 19 | super().__init__(**kwargs) 20 | assert self.in_channels == self.channels 21 | self.bn = nn.SyncBatchNorm(self.in_channels) 22 | self.resize_factors = resize_factors 23 | 24 | def _forward_feature(self, inputs): 25 | """Forward function for feature maps before classifying each pixel with 26 | ``self.cls_seg`` fc. 27 | 28 | Args: 29 | inputs (list[Tensor]): List of multi-level img features. 30 | 31 | Returns: 32 | feats (Tensor): A tensor of shape (batch_size, self.channels, 33 | H, W) which is feature map for last layer of decoder head. 34 | """ 35 | # print("inputs", [i.shape for i in inputs]) 36 | x = self._transform_inputs(inputs) 37 | # print("x", x.shape) 38 | feats = self.bn(x) 39 | # print("feats", feats.shape) 40 | return feats 41 | 42 | def _transform_inputs(self, inputs): 43 | """Transform inputs for decoder. 44 | Args: 45 | inputs (list[Tensor]): List of multi-level img features. 
46 | Returns: 47 | Tensor: The transformed inputs 48 | """ 49 | 50 | if self.input_transform == "resize_concat": 51 | # accept lists (for cls token) 52 | input_list = [] 53 | for x in inputs: 54 | if isinstance(x, list): 55 | input_list.extend(x) 56 | else: 57 | input_list.append(x) 58 | inputs = input_list 59 | # an image descriptor can be a local descriptor with resolution 1x1 60 | for i, x in enumerate(inputs): 61 | if len(x.shape) == 2: 62 | inputs[i] = x[:, :, None, None] 63 | # select indices 64 | inputs = [inputs[i] for i in self.in_index] 65 | # Resizing shenanigans 66 | # print("before", *(x.shape for x in inputs)) 67 | if self.resize_factors is not None: 68 | assert len(self.resize_factors) == len(inputs), (len(self.resize_factors), len(inputs)) 69 | inputs = [ 70 | resize(input=x, scale_factor=f, mode="bilinear" if f >= 1 else "area") 71 | for x, f in zip(inputs, self.resize_factors) 72 | ] 73 | # print("after", *(x.shape for x in inputs)) 74 | upsampled_inputs = [ 75 | resize(input=x, size=inputs[0].shape[2:], mode="bilinear", align_corners=self.align_corners) 76 | for x in inputs 77 | ] 78 | inputs = torch.cat(upsampled_inputs, dim=1) 79 | elif self.input_transform == "multiple_select": 80 | inputs = [inputs[i] for i in self.in_index] 81 | else: 82 | inputs = inputs[self.in_index] 83 | 84 | return inputs 85 | 86 | def forward(self, inputs): 87 | """Forward function.""" 88 | output = self._forward_feature(inputs) 89 | output = self.cls_seg(output) 90 | return output 91 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .core import * # noqa: F403 7 | from .models import * # noqa: F403 8 | from .ops import * # noqa: F403 9 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from mmseg.core.evaluation import * # noqa: F403 7 | from mmseg.core.seg import * # noqa: F403 8 | 9 | from .anchor import * # noqa: F403 10 | from .box import * # noqa: F403 11 | from .utils import * # noqa: F403 12 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/anchor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
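# Hedged sketch of what BNHead._transform_inputs above does for
# input_transform="resize_concat" (hypothetical shapes): multi-level features
# are upsampled to the first level's resolution and concatenated along the
# channel dimension before the single SyncBatchNorm and the `cls_seg` classifier.
import torch
import torch.nn.functional as F

feats = [torch.rand(2, 384, 32, 32), torch.rand(2, 384, 16, 16)]
upsampled = [F.interpolate(x, size=feats[0].shape[2:], mode="bilinear", align_corners=False) for x in feats]
fused = torch.cat(upsampled, dim=1)  # shape (2, 768, 32, 32)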
5 | 6 | from .point_generator import MlvlPointGenerator # noqa: F403 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/anchor/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import warnings 7 | 8 | from mmcv.utils import Registry, build_from_cfg 9 | 10 | PRIOR_GENERATORS = Registry("Generator for anchors and points") 11 | 12 | ANCHOR_GENERATORS = PRIOR_GENERATORS 13 | 14 | 15 | def build_prior_generator(cfg, default_args=None): 16 | return build_from_cfg(cfg, PRIOR_GENERATORS, default_args) 17 | 18 | 19 | def build_anchor_generator(cfg, default_args=None): 20 | warnings.warn("``build_anchor_generator`` would be deprecated soon, please use " "``build_prior_generator`` ") 21 | return build_prior_generator(cfg, default_args=default_args) 22 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .builder import * # noqa: F403 7 | from .samplers import MaskPseudoSampler # noqa: F403 8 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from mmcv.utils import Registry, build_from_cfg 7 | 8 | BBOX_SAMPLERS = Registry("bbox_sampler") 9 | BBOX_CODERS = Registry("bbox_coder") 10 | 11 | 12 | def build_sampler(cfg, **default_args): 13 | """Builder of box sampler.""" 14 | return build_from_cfg(cfg, BBOX_SAMPLERS, default_args) 15 | 16 | 17 | def build_bbox_coder(cfg, **default_args): 18 | """Builder of box coder.""" 19 | return build_from_cfg(cfg, BBOX_CODERS, default_args) 20 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .mask_pseudo_sampler import MaskPseudoSampler # noqa: F403 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/base_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
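# Hedged sketch of the mmcv registry pattern used by the builders above: the
# "type" key of a config dict selects a registered class and the remaining keys
# become constructor kwargs. Importing the `core.box` package registers the
# samplers as a side effect.
from dinov2.eval.segmentation_m2f.core.box import build_sampler

sampler = build_sampler(dict(type="MaskPseudoSampler"))  # -> a MaskPseudoSampler instance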
5 | 6 | from abc import ABCMeta, abstractmethod 7 | 8 | import torch 9 | 10 | from .sampling_result import SamplingResult 11 | 12 | 13 | class BaseSampler(metaclass=ABCMeta): 14 | """Base class of samplers.""" 15 | 16 | def __init__(self, num, pos_fraction, neg_pos_ub=-1, add_gt_as_proposals=True, **kwargs): 17 | self.num = num 18 | self.pos_fraction = pos_fraction 19 | self.neg_pos_ub = neg_pos_ub 20 | self.add_gt_as_proposals = add_gt_as_proposals 21 | self.pos_sampler = self 22 | self.neg_sampler = self 23 | 24 | @abstractmethod 25 | def _sample_pos(self, assign_result, num_expected, **kwargs): 26 | """Sample positive samples.""" 27 | pass 28 | 29 | @abstractmethod 30 | def _sample_neg(self, assign_result, num_expected, **kwargs): 31 | """Sample negative samples.""" 32 | pass 33 | 34 | def sample(self, assign_result, bboxes, gt_bboxes, gt_labels=None, **kwargs): 35 | """Sample positive and negative bboxes. 36 | 37 | This is a simple implementation of bbox sampling given candidates, 38 | assigning results and ground truth bboxes. 39 | 40 | Args: 41 | assign_result (:obj:`AssignResult`): Bbox assigning results. 42 | bboxes (Tensor): Boxes to be sampled from. 43 | gt_bboxes (Tensor): Ground truth bboxes. 44 | gt_labels (Tensor, optional): Class labels of ground truth bboxes. 45 | 46 | Returns: 47 | :obj:`SamplingResult`: Sampling result. 48 | 49 | Example: 50 | >>> from mmdet.core.bbox import RandomSampler 51 | >>> from mmdet.core.bbox import AssignResult 52 | >>> from mmdet.core.bbox.demodata import ensure_rng, random_boxes 53 | >>> rng = ensure_rng(None) 54 | >>> assign_result = AssignResult.random(rng=rng) 55 | >>> bboxes = random_boxes(assign_result.num_preds, rng=rng) 56 | >>> gt_bboxes = random_boxes(assign_result.num_gts, rng=rng) 57 | >>> gt_labels = None 58 | >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1, 59 | >>> add_gt_as_proposals=False) 60 | >>> self = self.sample(assign_result, bboxes, gt_bboxes, gt_labels) 61 | """ 62 | if len(bboxes.shape) < 2: 63 | bboxes = bboxes[None, :] 64 | 65 | bboxes = bboxes[:, :4] 66 | 67 | gt_flags = bboxes.new_zeros((bboxes.shape[0],), dtype=torch.uint8) 68 | if self.add_gt_as_proposals and len(gt_bboxes) > 0: 69 | if gt_labels is None: 70 | raise ValueError("gt_labels must be given when add_gt_as_proposals is True") 71 | bboxes = torch.cat([gt_bboxes, bboxes], dim=0) 72 | assign_result.add_gt_(gt_labels) 73 | gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) 74 | gt_flags = torch.cat([gt_ones, gt_flags]) 75 | 76 | num_expected_pos = int(self.num * self.pos_fraction) 77 | pos_inds = self.pos_sampler._sample_pos(assign_result, num_expected_pos, bboxes=bboxes, **kwargs) 78 | # We found that sampled indices have duplicated items occasionally. 
79 | # (may be a bug of PyTorch) 80 | pos_inds = pos_inds.unique() 81 | num_sampled_pos = pos_inds.numel() 82 | num_expected_neg = self.num - num_sampled_pos 83 | if self.neg_pos_ub >= 0: 84 | _pos = max(1, num_sampled_pos) 85 | neg_upper_bound = int(self.neg_pos_ub * _pos) 86 | if num_expected_neg > neg_upper_bound: 87 | num_expected_neg = neg_upper_bound 88 | neg_inds = self.neg_sampler._sample_neg(assign_result, num_expected_neg, bboxes=bboxes, **kwargs) 89 | neg_inds = neg_inds.unique() 90 | 91 | sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags) 92 | return sampling_result 93 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/mask_pseudo_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py 8 | 9 | import torch 10 | 11 | from ..builder import BBOX_SAMPLERS 12 | from .base_sampler import BaseSampler 13 | from .mask_sampling_result import MaskSamplingResult 14 | 15 | 16 | @BBOX_SAMPLERS.register_module() 17 | class MaskPseudoSampler(BaseSampler): 18 | """A pseudo sampler that does not actually do any sampling.""" 19 | 20 | def __init__(self, **kwargs): 21 | pass 22 | 23 | def _sample_pos(self, **kwargs): 24 | """Sample positive samples.""" 25 | raise NotImplementedError 26 | 27 | def _sample_neg(self, **kwargs): 28 | """Sample negative samples.""" 29 | raise NotImplementedError 30 | 31 | def sample(self, assign_result, masks, gt_masks, **kwargs): 32 | """Directly returns the positive and negative indices of samples. 33 | 34 | Args: 35 | assign_result (:obj:`AssignResult`): Assigned results 36 | masks (torch.Tensor): Predicted masks 37 | gt_masks (torch.Tensor): Ground truth masks 38 | Returns: 39 | :obj:`MaskSamplingResult`: Sampling results 40 | """ 41 | pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() 42 | neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() 43 | gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8) 44 | sampling_result = MaskSamplingResult(pos_inds, neg_inds, masks, gt_masks, assign_result, gt_flags) 45 | return sampling_result 46 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/mask_sampling_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
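# Hedged usage sketch of MaskPseudoSampler.sample above (tensor values are
# hypothetical; requires this package's mmcv-style dependencies): queries with
# gt_inds > 0 become positives and gt_inds == 0 negatives; nothing is subsampled.
import torch
from dinov2.eval.segmentation_m2f.core.box.samplers import MaskPseudoSampler
from dinov2.eval.segmentation_m2f.models.utils.assigner import AssignResult

assign_result = AssignResult(num_gts=2, gt_inds=torch.tensor([1, 0, 2, 0]), labels=torch.tensor([3, -1, 7, -1]))
masks, gt_masks = torch.rand(4, 32, 32), torch.rand(2, 32, 32)
result = MaskPseudoSampler().sample(assign_result, masks, gt_masks)
print(result.pos_inds, result.neg_inds)  # tensor([0, 2]) tensor([1, 3])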
5 | 6 | # References: 7 | # https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py 8 | 9 | import torch 10 | 11 | from .sampling_result import SamplingResult 12 | 13 | 14 | class MaskSamplingResult(SamplingResult): 15 | """Mask sampling result.""" 16 | 17 | def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result, gt_flags): 18 | self.pos_inds = pos_inds 19 | self.neg_inds = neg_inds 20 | self.pos_masks = masks[pos_inds] 21 | self.neg_masks = masks[neg_inds] 22 | self.pos_is_gt = gt_flags[pos_inds] 23 | 24 | self.num_gts = gt_masks.shape[0] 25 | self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 26 | 27 | if gt_masks.numel() == 0: 28 | # hack for index error case 29 | assert self.pos_assigned_gt_inds.numel() == 0 30 | self.pos_gt_masks = torch.empty_like(gt_masks) 31 | else: 32 | self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] 33 | 34 | if assign_result.labels is not None: 35 | self.pos_gt_labels = assign_result.labels[pos_inds] 36 | else: 37 | self.pos_gt_labels = None 38 | 39 | @property 40 | def masks(self): 41 | """torch.Tensor: concatenated positive and negative boxes""" 42 | return torch.cat([self.pos_masks, self.neg_masks]) 43 | 44 | def __nice__(self): 45 | data = self.info.copy() 46 | data["pos_masks"] = data.pop("pos_masks").shape 47 | data["neg_masks"] = data.pop("neg_masks").shape 48 | parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] 49 | body = " " + ",\n ".join(parts) 50 | return "{\n" + body + "\n}" 51 | 52 | @property 53 | def info(self): 54 | """Returns a dictionary of info about the object.""" 55 | return { 56 | "pos_inds": self.pos_inds, 57 | "neg_inds": self.neg_inds, 58 | "pos_masks": self.pos_masks, 59 | "neg_masks": self.neg_masks, 60 | "pos_is_gt": self.pos_is_gt, 61 | "num_gts": self.num_gts, 62 | "pos_assigned_gt_inds": self.pos_assigned_gt_inds, 63 | } 64 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/sampling_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | 8 | 9 | class SamplingResult: 10 | """Bbox sampling result. 
11 | 12 | Example: 13 | >>> # xdoctest: +IGNORE_WANT 14 | >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA 15 | >>> self = SamplingResult.random(rng=10) 16 | >>> print(f'self = {self}') 17 | self = 26 | """ 27 | 28 | def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags): 29 | self.pos_inds = pos_inds 30 | self.neg_inds = neg_inds 31 | self.pos_bboxes = bboxes[pos_inds] 32 | self.neg_bboxes = bboxes[neg_inds] 33 | self.pos_is_gt = gt_flags[pos_inds] 34 | 35 | self.num_gts = gt_bboxes.shape[0] 36 | self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 37 | 38 | if gt_bboxes.numel() == 0: 39 | # hack for index error case 40 | assert self.pos_assigned_gt_inds.numel() == 0 41 | self.pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) 42 | else: 43 | if len(gt_bboxes.shape) < 2: 44 | gt_bboxes = gt_bboxes.view(-1, 4) 45 | 46 | self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds.long(), :] 47 | 48 | if assign_result.labels is not None: 49 | self.pos_gt_labels = assign_result.labels[pos_inds] 50 | else: 51 | self.pos_gt_labels = None 52 | 53 | @property 54 | def bboxes(self): 55 | """torch.Tensor: concatenated positive and negative boxes""" 56 | return torch.cat([self.pos_bboxes, self.neg_bboxes]) 57 | 58 | def to(self, device): 59 | """Change the device of the data inplace. 60 | 61 | Example: 62 | >>> self = SamplingResult.random() 63 | >>> print(f'self = {self.to(None)}') 64 | >>> # xdoctest: +REQUIRES(--gpu) 65 | >>> print(f'self = {self.to(0)}') 66 | """ 67 | _dict = self.__dict__ 68 | for key, value in _dict.items(): 69 | if isinstance(value, torch.Tensor): 70 | _dict[key] = value.to(device) 71 | return self 72 | 73 | def __nice__(self): 74 | data = self.info.copy() 75 | data["pos_bboxes"] = data.pop("pos_bboxes").shape 76 | data["neg_bboxes"] = data.pop("neg_bboxes").shape 77 | parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] 78 | body = " " + ",\n ".join(parts) 79 | return "{\n" + body + "\n}" 80 | 81 | @property 82 | def info(self): 83 | """Returns a dictionary of info about the object.""" 84 | return { 85 | "pos_inds": self.pos_inds, 86 | "neg_inds": self.neg_inds, 87 | "pos_bboxes": self.pos_bboxes, 88 | "neg_bboxes": self.neg_bboxes, 89 | "pos_is_gt": self.pos_is_gt, 90 | "num_gts": self.num_gts, 91 | "pos_assigned_gt_inds": self.pos_assigned_gt_inds, 92 | } 93 | 94 | @classmethod 95 | def random(cls, rng=None, **kwargs): 96 | """ 97 | Args: 98 | rng (None | int | numpy.random.RandomState): seed or state. 99 | kwargs (keyword arguments): 100 | - num_preds: number of predicted boxes 101 | - num_gts: number of true boxes 102 | - p_ignore (float): probability of a predicted box assigned to \ 103 | an ignored truth. 104 | - p_assigned (float): probability of a predicted box not being \ 105 | assigned. 106 | - p_use_label (float | bool): with labels or not. 107 | 108 | Returns: 109 | :obj:`SamplingResult`: Randomly generated sampling result. 110 | 111 | Example: 112 | >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA 113 | >>> self = SamplingResult.random() 114 | >>> print(self.__dict__) 115 | """ 116 | from mmdet.core.bbox import demodata 117 | from mmdet.core.bbox.assigners.assign_result import AssignResult 118 | from mmdet.core.bbox.samplers.random_sampler import RandomSampler 119 | 120 | rng = demodata.ensure_rng(rng) 121 | 122 | # make probabalistic? 
123 | num = 32 124 | pos_fraction = 0.5 125 | neg_pos_ub = -1 126 | 127 | assign_result = AssignResult.random(rng=rng, **kwargs) 128 | 129 | # Note we could just compute an assignment 130 | bboxes = demodata.random_boxes(assign_result.num_preds, rng=rng) 131 | gt_bboxes = demodata.random_boxes(assign_result.num_gts, rng=rng) 132 | 133 | if rng.rand() > 0.2: 134 | # sometimes algorithms squeeze their data, be robust to that 135 | gt_bboxes = gt_bboxes.squeeze() 136 | bboxes = bboxes.squeeze() 137 | 138 | if assign_result.labels is None: 139 | gt_labels = None 140 | else: 141 | gt_labels = None # both branches are None: random gt labels are not generated here 142 | 143 | if gt_labels is None: 144 | add_gt_as_proposals = False 145 | else: 146 | add_gt_as_proposals = True # make probabilistic? 147 | 148 | sampler = RandomSampler( 149 | num, pos_fraction, neg_pos_ub=neg_pos_ub, add_gt_as_proposals=add_gt_as_proposals, rng=rng 150 | ) 151 | self = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels) 152 | return self 153 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .dist_utils import reduce_mean 7 | from .misc import add_prefix, multi_apply 8 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch.distributed as dist 7 | 8 | 9 | def reduce_mean(tensor): 10 | """Obtain the mean of a tensor across different GPUs.""" 11 | if not (dist.is_available() and dist.is_initialized()): 12 | return tensor 13 | tensor = tensor.clone() 14 | dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) 15 | return tensor 16 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from functools import partial 7 | 8 | 9 | def multi_apply(func, *args, **kwargs): 10 | """Apply function to a list of arguments. 11 | 12 | Note: 13 | This function applies ``func`` to multiple inputs and 14 | maps the multiple outputs of ``func`` into different 15 | lists. Each list contains the same type of outputs corresponding 16 | to different inputs. 17 | 18 | Args: 19 | func (Function): A function that will be applied to a list of 20 | arguments 21 | 22 | Returns: 23 | tuple(list): A tuple containing multiple lists, where each list \ 24 | contains one kind of result returned by the function 25 | """ 26 | pfunc = partial(func, **kwargs) if kwargs else func 27 | map_results = map(pfunc, *args) 28 | return tuple(map(list, zip(*map_results))) 29 | 30 | 31 | def add_prefix(inputs, prefix): 32 | """Add prefix for dict.
33 | 34 | Args: 35 | inputs (dict): The input dict with str keys. 36 | prefix (str): The prefix to add. 37 | 38 | Returns: 39 | 40 | dict: The dict with keys updated with ``prefix``. 41 | """ 42 | 43 | outputs = dict() 44 | for name, value in inputs.items(): 45 | outputs[f"{prefix}.{name}"] = value 46 | 47 | return outputs 48 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .backbones import * # noqa: F403 7 | from .builder import MASK_ASSIGNERS, MATCH_COST, TRANSFORMER, build_assigner, build_match_cost 8 | from .decode_heads import * # noqa: F403 9 | from .losses import * # noqa: F403 10 | from .plugins import * # noqa: F403 11 | from .segmentors import * # noqa: F403 12 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .vit_adapter import ViTAdapter 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/backbones/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 9 | 10 | from torch import nn 11 | 12 | 13 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 14 | if drop_prob == 0.0 or not training: 15 | return x 16 | keep_prob = 1 - drop_prob 17 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 18 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 19 | if keep_prob > 0.0: 20 | random_tensor.div_(keep_prob) 21 | return x * random_tensor 22 | 23 | 24 | class DropPath(nn.Module): 25 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 26 | 27 | def __init__(self, drop_prob: float = 0.0): 28 | super(DropPath, self).__init__() 29 | self.drop_prob = drop_prob 30 | 31 | def forward(self, x): 32 | return drop_path(x, self.drop_prob, self.training) 33 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
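# Hedged sketch of the stochastic-depth module above (assumes this package's
# dependencies are installed): in training mode each sample's residual branch is
# zeroed with probability drop_prob and survivors are rescaled by 1 / keep_prob,
# keeping the expected value unchanged; in eval mode it is the identity.
import torch
from dinov2.eval.segmentation_m2f.models.backbones.drop_path import DropPath

dp = DropPath(drop_prob=0.5).train()
out = dp(torch.ones(8, 4))  # each row is either all 0.0 or all 2.0
assert torch.equal(dp.eval()(torch.ones(8, 4)), torch.ones(8, 4))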
5 | 6 | from mmcv.utils import Registry 7 | 8 | TRANSFORMER = Registry("Transformer") 9 | MASK_ASSIGNERS = Registry("mask_assigner") 10 | MATCH_COST = Registry("match_cost") 11 | 12 | 13 | def build_match_cost(cfg): 14 | """Build Match Cost.""" 15 | return MATCH_COST.build(cfg) 16 | 17 | 18 | def build_assigner(cfg): 19 | """Build Assigner.""" 20 | return MASK_ASSIGNERS.build(cfg) 21 | 22 | 23 | def build_transformer(cfg): 24 | """Build Transformer.""" 25 | return TRANSFORMER.build(cfg) 26 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .mask2former_head import Mask2FormerHead 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .cross_entropy_loss import CrossEntropyLoss, binary_cross_entropy, cross_entropy, mask_cross_entropy 7 | from .dice_loss import DiceLoss 8 | from .match_costs import ClassificationCost, CrossEntropyLossCost, DiceCost 9 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/losses/dice_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | from mmseg.models.builder import LOSSES 9 | from mmseg.models.losses.utils import weight_reduce_loss 10 | 11 | 12 | def dice_loss(pred, target, weight=None, eps=1e-3, reduction="mean", avg_factor=None): 13 | """Calculate dice loss, which is proposed in 14 | `V-Net: Fully Convolutional Neural Networks for Volumetric 15 | Medical Image Segmentation <https://arxiv.org/abs/1606.04797>`_. 16 | 17 | Args: 18 | pred (torch.Tensor): The prediction, has a shape (n, *) 19 | target (torch.Tensor): The learning label of the prediction, 20 | shape (n, *), same shape of pred. 21 | weight (torch.Tensor, optional): The weight of loss for each 22 | prediction, has a shape (n,). Defaults to None. 23 | eps (float): Avoid dividing by zero. Default: 1e-3. 24 | reduction (str, optional): The method used to reduce the loss into 25 | a scalar. Defaults to 'mean'. 26 | Options are "none", "mean" and "sum". 27 | avg_factor (int, optional): Average factor that is used to average 28 | the loss. Defaults to None.
29 | """ 30 | 31 | input = pred.flatten(1) 32 | target = target.flatten(1).float() 33 | 34 | a = torch.sum(input * target, 1) 35 | b = torch.sum(input * input, 1) + eps 36 | c = torch.sum(target * target, 1) + eps 37 | d = (2 * a) / (b + c) 38 | loss = 1 - d 39 | if weight is not None: 40 | assert weight.ndim == loss.ndim 41 | assert len(weight) == len(pred) 42 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 43 | return loss 44 | 45 | 46 | def naive_dice_loss(pred, target, weight=None, eps=1e-3, reduction="mean", avg_factor=None): 47 | """Calculate naive dice loss, the coefficient in the denominator is the 48 | first power instead of the second power. 49 | 50 | Args: 51 | pred (torch.Tensor): The prediction, has a shape (n, *) 52 | target (torch.Tensor): The learning label of the prediction, 53 | shape (n, *), same shape of pred. 54 | weight (torch.Tensor, optional): The weight of loss for each 55 | prediction, has a shape (n,). Defaults to None. 56 | eps (float): Avoid dividing by zero. Default: 1e-3. 57 | reduction (str, optional): The method used to reduce the loss into 58 | a scalar. Defaults to 'mean'. 59 | Options are "none", "mean" and "sum". 60 | avg_factor (int, optional): Average factor that is used to average 61 | the loss. Defaults to None. 62 | """ 63 | input = pred.flatten(1) 64 | target = target.flatten(1).float() 65 | 66 | a = torch.sum(input * target, 1) 67 | b = torch.sum(input, 1) 68 | c = torch.sum(target, 1) 69 | d = (2 * a + eps) / (b + c + eps) 70 | loss = 1 - d 71 | if weight is not None: 72 | assert weight.ndim == loss.ndim 73 | assert len(weight) == len(pred) 74 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 75 | return loss 76 | 77 | 78 | @LOSSES.register_module(force=True) 79 | class DiceLoss(nn.Module): 80 | def __init__(self, use_sigmoid=True, activate=True, reduction="mean", naive_dice=False, loss_weight=1.0, eps=1e-3): 81 | """Dice Loss, there are two forms of dice loss is supported: 82 | 83 | - the one proposed in `V-Net: Fully Convolutional Neural 84 | Networks for Volumetric Medical Image Segmentation 85 | `_. 86 | - the dice loss in which the power of the number in the 87 | denominator is the first power instead of the second 88 | power. 89 | 90 | Args: 91 | use_sigmoid (bool, optional): Whether to the prediction is 92 | used for sigmoid or softmax. Defaults to True. 93 | activate (bool): Whether to activate the predictions inside, 94 | this will disable the inside sigmoid operation. 95 | Defaults to True. 96 | reduction (str, optional): The method used 97 | to reduce the loss. Options are "none", 98 | "mean" and "sum". Defaults to 'mean'. 99 | naive_dice (bool, optional): If false, use the dice 100 | loss defined in the V-Net paper, otherwise, use the 101 | naive dice loss in which the power of the number in the 102 | denominator is the first power instead of the second 103 | power.Defaults to False. 104 | loss_weight (float, optional): Weight of loss. Defaults to 1.0. 105 | eps (float): Avoid dividing by zero. Defaults to 1e-3. 106 | """ 107 | 108 | super(DiceLoss, self).__init__() 109 | self.use_sigmoid = use_sigmoid 110 | self.reduction = reduction 111 | self.naive_dice = naive_dice 112 | self.loss_weight = loss_weight 113 | self.eps = eps 114 | self.activate = activate 115 | 116 | def forward(self, pred, target, weight=None, reduction_override=None, avg_factor=None): 117 | """Forward function. 118 | 119 | Args: 120 | pred (torch.Tensor): The prediction, has a shape (n, *). 
121 | target (torch.Tensor): The label of the prediction, 122 | shape (n, *), same shape of pred. 123 | weight (torch.Tensor, optional): The weight of loss for each 124 | prediction, has a shape (n,). Defaults to None. 125 | avg_factor (int, optional): Average factor that is used to average 126 | the loss. Defaults to None. 127 | reduction_override (str, optional): The reduction method used to 128 | override the original reduction method of the loss. 129 | Options are "none", "mean" and "sum". 130 | 131 | Returns: 132 | torch.Tensor: The calculated loss 133 | """ 134 | 135 | assert reduction_override in (None, "none", "mean", "sum") 136 | reduction = reduction_override if reduction_override else self.reduction 137 | 138 | if self.activate: 139 | if self.use_sigmoid: 140 | pred = pred.sigmoid() 141 | else: 142 | raise NotImplementedError 143 | 144 | if self.naive_dice: 145 | loss = self.loss_weight * naive_dice_loss( 146 | pred, target, weight, eps=self.eps, reduction=reduction, avg_factor=avg_factor 147 | ) 148 | else: 149 | loss = self.loss_weight * dice_loss( 150 | pred, target, weight, eps=self.eps, reduction=reduction, avg_factor=avg_factor 151 | ) 152 | 153 | return loss 154 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/losses/match_costs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | from ..builder import MATCH_COST 10 | 11 | 12 | @MATCH_COST.register_module() 13 | class ClassificationCost: 14 | """ClsSoftmaxCost. Borrowed from 15 | mmdet.core.bbox.match_costs.match_cost.ClassificationCost. 16 | 17 | Args: 18 | weight (int | float, optional): loss_weight 19 | 20 | Examples: 21 | >>> import torch 22 | >>> self = ClassificationCost() 23 | >>> cls_pred = torch.rand(4, 3) 24 | >>> gt_labels = torch.tensor([0, 1, 2]) 25 | >>> factor = torch.tensor([10, 8, 10, 8]) 26 | >>> self(cls_pred, gt_labels) 27 | tensor([[-0.3430, -0.3525, -0.3045], 28 | [-0.3077, -0.2931, -0.3992], 29 | [-0.3664, -0.3455, -0.2881], 30 | [-0.3343, -0.2701, -0.3956]]) 31 | """ 32 | 33 | def __init__(self, weight=1.0): 34 | self.weight = weight 35 | 36 | def __call__(self, cls_pred, gt_labels): 37 | """ 38 | Args: 39 | cls_pred (Tensor): Predicted classification logits, shape 40 | [num_query, num_class]. 41 | gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 42 | 43 | Returns: 44 | torch.Tensor: cls_cost value with weight 45 | """ 46 | # Following the official DETR repo: while the loss uses NLL, 47 | # the cost here is approximated by 1 - cls_score[gt_label]. 48 | # The 1 is a constant that doesn't change the matching, 49 | # so it can be omitted. 50 | cls_score = cls_pred.softmax(-1) 51 | cls_cost = -cls_score[:, gt_labels] 52 | return cls_cost * self.weight 53 | 54 | 55 | @MATCH_COST.register_module() 56 | class DiceCost: 57 | """Cost of mask assignments based on dice losses. 58 | 59 | Args: 60 | weight (int | float, optional): loss_weight. Defaults to 1. 61 | pred_act (bool, optional): Whether to apply sigmoid to mask_pred. 62 | Defaults to False. 63 | eps (float, optional): Defaults to 1e-3.
64 | """ 65 | 66 | def __init__(self, weight=1.0, pred_act=False, eps=1e-3): 67 | self.weight = weight 68 | self.pred_act = pred_act 69 | self.eps = eps 70 | 71 | def binary_mask_dice_loss(self, mask_preds, gt_masks): 72 | """ 73 | Args: 74 | mask_preds (Tensor): Mask prediction in shape (N1, H, W). 75 | gt_masks (Tensor): Ground truth in shape (N2, H, W) 76 | store 0 or 1, 0 for negative class and 1 for 77 | positive class. 78 | 79 | Returns: 80 | Tensor: Dice cost matrix in shape (N1, N2). 81 | """ 82 | mask_preds = mask_preds.reshape((mask_preds.shape[0], -1)) 83 | gt_masks = gt_masks.reshape((gt_masks.shape[0], -1)).float() 84 | numerator = 2 * torch.einsum("nc,mc->nm", mask_preds, gt_masks) 85 | denominator = mask_preds.sum(-1)[:, None] + gt_masks.sum(-1)[None, :] 86 | loss = 1 - (numerator + self.eps) / (denominator + self.eps) 87 | return loss 88 | 89 | def __call__(self, mask_preds, gt_masks): 90 | """ 91 | Args: 92 | mask_preds (Tensor): Mask prediction logits in shape (N1, H, W). 93 | gt_masks (Tensor): Ground truth in shape (N2, H, W). 94 | 95 | Returns: 96 | Tensor: Dice cost matrix in shape (N1, N2). 97 | """ 98 | if self.pred_act: 99 | mask_preds = mask_preds.sigmoid() 100 | dice_cost = self.binary_mask_dice_loss(mask_preds, gt_masks) 101 | return dice_cost * self.weight 102 | 103 | 104 | @MATCH_COST.register_module() 105 | class CrossEntropyLossCost: 106 | """CrossEntropyLossCost. 107 | 108 | Args: 109 | weight (int | float, optional): loss weight. Defaults to 1. 110 | use_sigmoid (bool, optional): Whether the prediction uses sigmoid 111 | of softmax. Defaults to True. 112 | """ 113 | 114 | def __init__(self, weight=1.0, use_sigmoid=True): 115 | assert use_sigmoid, "use_sigmoid = False is not supported yet." 116 | self.weight = weight 117 | self.use_sigmoid = use_sigmoid 118 | 119 | def _binary_cross_entropy(self, cls_pred, gt_labels): 120 | """ 121 | Args: 122 | cls_pred (Tensor): The prediction with shape (num_query, 1, *) or 123 | (num_query, *). 124 | gt_labels (Tensor): The learning label of prediction with 125 | shape (num_gt, *). 126 | Returns: 127 | Tensor: Cross entropy cost matrix in shape (num_query, num_gt). 128 | """ 129 | cls_pred = cls_pred.flatten(1).float() 130 | gt_labels = gt_labels.flatten(1).float() 131 | n = cls_pred.shape[1] 132 | pos = F.binary_cross_entropy_with_logits(cls_pred, torch.ones_like(cls_pred), reduction="none") 133 | neg = F.binary_cross_entropy_with_logits(cls_pred, torch.zeros_like(cls_pred), reduction="none") 134 | cls_cost = torch.einsum("nc,mc->nm", pos, gt_labels) + torch.einsum("nc,mc->nm", neg, 1 - gt_labels) 135 | cls_cost = cls_cost / n 136 | 137 | return cls_cost 138 | 139 | def __call__(self, cls_pred, gt_labels): 140 | """ 141 | Args: 142 | cls_pred (Tensor): Predicted classification logits. 143 | gt_labels (Tensor): Labels. 144 | Returns: 145 | Tensor: Cross entropy cost matrix with weight in 146 | shape (num_query, num_gt). 147 | """ 148 | if self.use_sigmoid: 149 | cls_cost = self._binary_cross_entropy(cls_pred, gt_labels) 150 | else: 151 | raise NotImplementedError 152 | 153 | return cls_cost * self.weight 154 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/segmentors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .encoder_decoder_mask2former import EncoderDecoderMask2Former 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .assigner import MaskHungarianAssigner 7 | from .point_sample import get_uncertain_point_coords_with_randomness 8 | from .positional_encoding import LearnedPositionalEncoding, SinePositionalEncoding 9 | from .transformer import DetrTransformerDecoder, DetrTransformerDecoderLayer, DynamicConv, Transformer 10 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/utils/assigner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from abc import ABCMeta, abstractmethod 7 | 8 | import torch 9 | 10 | from ..builder import MASK_ASSIGNERS, build_match_cost 11 | 12 | try: 13 | from scipy.optimize import linear_sum_assignment 14 | except ImportError: 15 | linear_sum_assignment = None 16 | 17 | 18 | class AssignResult(metaclass=ABCMeta): 19 | """Collection of assign results.""" 20 | 21 | def __init__(self, num_gts, gt_inds, labels): 22 | self.num_gts = num_gts 23 | self.gt_inds = gt_inds 24 | self.labels = labels 25 | 26 | @property 27 | def info(self): 28 | info = { 29 | "num_gts": self.num_gts, 30 | "gt_inds": self.gt_inds, 31 | "labels": self.labels, 32 | } 33 | return info 34 | 35 | 36 | class BaseAssigner(metaclass=ABCMeta): 37 | """Base assigner that assigns boxes to ground truth boxes.""" 38 | 39 | @abstractmethod 40 | def assign(self, masks, gt_masks, gt_masks_ignore=None, gt_labels=None): 41 | """Assign each box to either a ground truth box or a negative sample.""" 42 | pass 43 | 44 | 45 | @MASK_ASSIGNERS.register_module() 46 | class MaskHungarianAssigner(BaseAssigner): 47 | """Computes one-to-one matching between predictions and ground truth for 48 | masks. 49 | 50 | This class computes an assignment between the targets and the predictions 51 | based on the costs. The costs are a weighted sum of three components: 52 | the classification cost, the mask cost and the dice cost. The 53 | targets don't include the no_object category, so generally there are more 54 | predictions than targets. After the one-to-one matching, the un-matched 55 | are treated as backgrounds.
Thus each query prediction will be assigned 56 | with `0` or a positive integer indicating the ground truth index: 57 | 58 | - 0: negative sample, no assigned gt 59 | - positive integer: positive sample, index (1-based) of assigned gt 60 | 61 | Args: 62 | cls_cost (obj:`mmcv.ConfigDict`|dict): Classification cost config. 63 | mask_cost (obj:`mmcv.ConfigDict`|dict): Mask cost config. 64 | dice_cost (obj:`mmcv.ConfigDict`|dict): Dice cost config. 65 | """ 66 | 67 | def __init__( 68 | self, 69 | cls_cost=dict(type="ClassificationCost", weight=1.0), 70 | dice_cost=dict(type="DiceCost", weight=1.0), 71 | mask_cost=dict(type="MaskFocalCost", weight=1.0), 72 | ): 73 | self.cls_cost = build_match_cost(cls_cost) 74 | self.dice_cost = build_match_cost(dice_cost) 75 | self.mask_cost = build_match_cost(mask_cost) 76 | 77 | def assign(self, cls_pred, mask_pred, gt_labels, gt_masks, img_meta, gt_masks_ignore=None, eps=1e-7): 78 | """Computes one-to-one matching based on the weighted costs. 79 | 80 | This method assign each query prediction to a ground truth or 81 | background. The `assigned_gt_inds` with -1 means don't care, 82 | 0 means negative sample, and positive number is the index (1-based) 83 | of assigned gt. 84 | The assignment is done in the following steps, the order matters. 85 | 86 | 1. assign every prediction to -1 87 | 2. compute the weighted costs 88 | 3. do Hungarian matching on CPU based on the costs 89 | 4. assign all to 0 (background) first, then for each matched pair 90 | between predictions and gts, treat this prediction as foreground 91 | and assign the corresponding gt index (plus 1) to it. 92 | 93 | Args: 94 | mask_pred (Tensor): Predicted mask, shape [num_query, h, w] 95 | cls_pred (Tensor): Predicted classification logits, shape 96 | [num_query, num_class]. 97 | gt_masks (Tensor): Ground truth mask, shape [num_gt, h, w]. 98 | gt_labels (Tensor): Label of `gt_masks`, shape (num_gt,). 99 | img_meta (dict): Meta information for current image. 100 | gt_masks_ignore (Tensor, optional): Ground truth masks that are 101 | labelled as `ignored`. Default None. 102 | eps (int | float, optional): A value added to the denominator for 103 | numerical stability. Default 1e-7. 104 | 105 | Returns: 106 | :obj:`AssignResult`: The assigned result. 107 | """ 108 | assert gt_masks_ignore is None, "Only case when gt_masks_ignore is None is supported." 109 | num_gts, num_queries = gt_labels.shape[0], cls_pred.shape[0] 110 | 111 | # 1. assign -1 by default 112 | assigned_gt_inds = cls_pred.new_full((num_queries,), -1, dtype=torch.long) 113 | assigned_labels = cls_pred.new_full((num_queries,), -1, dtype=torch.long) 114 | if num_gts == 0 or num_queries == 0: 115 | # No ground truth or boxes, return empty assignment 116 | if num_gts == 0: 117 | # No ground truth, assign all to background 118 | assigned_gt_inds[:] = 0 119 | return AssignResult(num_gts, assigned_gt_inds, labels=assigned_labels) 120 | 121 | # 2. compute the weighted costs 122 | # classification and maskcost. 
123 | if self.cls_cost.weight != 0 and cls_pred is not None: 124 | cls_cost = self.cls_cost(cls_pred, gt_labels) 125 | else: 126 | cls_cost = 0 127 | 128 | if self.mask_cost.weight != 0: 129 | # mask_pred shape = [nq, h, w] 130 | # gt_mask shape = [ng, h, w] 131 | # mask_cost shape = [nq, ng] 132 | mask_cost = self.mask_cost(mask_pred, gt_masks) 133 | else: 134 | mask_cost = 0 135 | 136 | if self.dice_cost.weight != 0: 137 | dice_cost = self.dice_cost(mask_pred, gt_masks) 138 | else: 139 | dice_cost = 0 140 | cost = cls_cost + mask_cost + dice_cost 141 | 142 | # 3. do Hungarian matching on CPU using linear_sum_assignment 143 | cost = cost.detach().cpu() 144 | if linear_sum_assignment is None: 145 | raise ImportError('Please run "pip install scipy" ' "to install scipy first.") 146 | 147 | matched_row_inds, matched_col_inds = linear_sum_assignment(cost) 148 | matched_row_inds = torch.from_numpy(matched_row_inds).to(cls_pred.device) 149 | matched_col_inds = torch.from_numpy(matched_col_inds).to(cls_pred.device) 150 | 151 | # 4. assign backgrounds and foregrounds 152 | # assign all indices to backgrounds first 153 | assigned_gt_inds[:] = 0 154 | # assign foregrounds based on matching results 155 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 156 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] 157 | return AssignResult(num_gts, assigned_gt_inds, labels=assigned_labels) 158 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/utils/point_sample.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | from mmcv.ops import point_sample 8 | 9 | 10 | def get_uncertainty(mask_pred, labels): 11 | """Estimate uncertainty based on pred logits. 12 | 13 | We estimate uncertainty as L1 distance between 0.0 and the logits 14 | prediction in 'mask_pred' for the foreground class in `classes`. 15 | 16 | Args: 17 | mask_pred (Tensor): mask predication logits, shape (num_rois, 18 | num_classes, mask_height, mask_width). 19 | 20 | labels (list[Tensor]): Either predicted or ground truth label for 21 | each predicted mask, of length num_rois. 22 | 23 | Returns: 24 | scores (Tensor): Uncertainty scores with the most uncertain 25 | locations having the highest uncertainty score, 26 | shape (num_rois, 1, mask_height, mask_width) 27 | """ 28 | if mask_pred.shape[1] == 1: 29 | gt_class_logits = mask_pred.clone() 30 | else: 31 | inds = torch.arange(mask_pred.shape[0], device=mask_pred.device) 32 | gt_class_logits = mask_pred[inds, labels].unsqueeze(1) 33 | return -torch.abs(gt_class_logits) 34 | 35 | 36 | def get_uncertain_point_coords_with_randomness( 37 | mask_pred, labels, num_points, oversample_ratio, importance_sample_ratio 38 | ): 39 | """Get ``num_points`` most uncertain points with random points during 40 | train. 41 | 42 | Sample points in [0, 1] x [0, 1] coordinate space based on their 43 | uncertainty. The uncertainties are calculated for each point using 44 | 'get_uncertainty()' function that takes point's logit prediction as 45 | input. 46 | 47 | Args: 48 | mask_pred (Tensor): A tensor of shape (num_rois, num_classes, 49 | mask_height, mask_width) for class-specific or class-agnostic 50 | prediction. 
51 | labels (list): The ground truth class for each instance. 52 | num_points (int): The number of points to sample. 53 | oversample_ratio (int): Oversampling parameter. 54 | importance_sample_ratio (float): Ratio of points that are sampled 55 | via importance sampling. 56 | 57 | Returns: 58 | point_coords (Tensor): A tensor of shape (num_rois, num_points, 2) 59 | that contains the coordinates of the sampled points. 60 | """ 61 | assert oversample_ratio >= 1 62 | assert 0 <= importance_sample_ratio <= 1 63 | batch_size = mask_pred.shape[0] 64 | num_sampled = int(num_points * oversample_ratio) 65 | point_coords = torch.rand(batch_size, num_sampled, 2, device=mask_pred.device) 66 | point_logits = point_sample(mask_pred, point_coords) 67 | # It is crucial to calculate uncertainty based on the sampled 68 | # prediction value for the points. Calculating uncertainties of the 69 | # coarse predictions first and sampling them for points leads to 70 | # incorrect results. To illustrate this: assume uncertainty func( 71 | # logits)=-abs(logits), a sampled point between two coarse 72 | # predictions with -1 and 1 logits has 0 logits, and therefore 0 73 | # uncertainty value. However, if we calculate uncertainties for the 74 | # coarse predictions first, both will have -1 uncertainty, 75 | # and the sampled point will get -1 uncertainty. 76 | point_uncertainties = get_uncertainty(point_logits, labels) 77 | num_uncertain_points = int(importance_sample_ratio * num_points) 78 | num_random_points = num_points - num_uncertain_points 79 | idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] 80 | shift = num_sampled * torch.arange(batch_size, dtype=torch.long, device=mask_pred.device) 81 | idx += shift[:, None] 82 | point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(batch_size, num_uncertain_points, 2) 83 | if num_random_points > 0: 84 | rand_roi_coords = torch.rand(batch_size, num_random_points, 2, device=mask_pred.device) 85 | point_coords = torch.cat((point_coords, rand_roi_coords), dim=1) 86 | return point_coords 87 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/utils/positional_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import math 7 | 8 | import torch 9 | import torch.nn as nn 10 | from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING 11 | from mmcv.runner import BaseModule 12 | 13 | 14 | @POSITIONAL_ENCODING.register_module() 15 | class SinePositionalEncoding(BaseModule): 16 | """Position encoding with sine and cosine functions. 17 | 18 | See `End-to-End Object Detection with Transformers 19 | <https://arxiv.org/abs/2005.12872>`_ for details. 20 | 21 | Args: 22 | num_feats (int): The feature dimension for each position 23 | along x-axis or y-axis. Note the final returned dimension 24 | for each position is 2 times of this value. 25 | temperature (int, optional): The temperature used for scaling 26 | the position embedding. Defaults to 10000. 27 | normalize (bool, optional): Whether to normalize the position 28 | embedding. Defaults to False. 29 | scale (float, optional): A scale factor that scales the position 30 | embedding. The scale will be used only when `normalize` is True. 31 | Defaults to 2*pi.
32 | eps (float, optional): A value added to the denominator for 33 | numerical stability. Defaults to 1e-6. 34 | offset (float): An offset added to the embedding when performing 35 | normalization. Defaults to 0. 36 | init_cfg (dict or list[dict], optional): Initialization config dict. 37 | Default: None 38 | """ 39 | 40 | def __init__( 41 | self, num_feats, temperature=10000, normalize=False, scale=2 * math.pi, eps=1e-6, offset=0.0, init_cfg=None 42 | ): 43 | super(SinePositionalEncoding, self).__init__(init_cfg) 44 | if normalize: 45 | assert isinstance(scale, (float, int)), ( 46 | f"when normalize is set, scale should be provided and in float or int type, found {type(scale)}" 47 | ) 48 | self.num_feats = num_feats 49 | self.temperature = temperature 50 | self.normalize = normalize 51 | self.scale = scale 52 | self.eps = eps 53 | self.offset = offset 54 | 55 | def forward(self, mask): 56 | """Forward function for `SinePositionalEncoding`. 57 | 58 | Args: 59 | mask (Tensor): ByteTensor mask. Non-zero values represent 60 | ignored positions, while zero values mean valid positions 61 | for this image. Shape [bs, h, w]. 62 | 63 | Returns: 64 | pos (Tensor): Returned position embedding with shape 65 | [bs, num_feats*2, h, w]. 66 | """ 67 | # For convenience of exporting to ONNX, it's required to convert 68 | # `mask` from bool to int. 69 | mask = mask.to(torch.int) 70 | not_mask = 1 - mask # logical_not 71 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 72 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 73 | if self.normalize: 74 | y_embed = (y_embed + self.offset) / (y_embed[:, -1:, :] + self.eps) * self.scale 75 | x_embed = (x_embed + self.offset) / (x_embed[:, :, -1:] + self.eps) * self.scale 76 | dim_t = torch.arange(self.num_feats, dtype=torch.float32, device=mask.device) 77 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats) 78 | pos_x = x_embed[:, :, :, None] / dim_t 79 | pos_y = y_embed[:, :, :, None] / dim_t 80 | # use `view` instead of `flatten` for dynamically exporting to ONNX 81 | B, H, W = mask.size() 82 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).view(B, H, W, -1) 83 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).view(B, H, W, -1) 84 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 85 | return pos 86 | 87 | def __repr__(self): 88 | """str: a string that describes the module""" 89 | repr_str = self.__class__.__name__ 90 | repr_str += f"(num_feats={self.num_feats}, " 91 | repr_str += f"temperature={self.temperature}, " 92 | repr_str += f"normalize={self.normalize}, " 93 | repr_str += f"scale={self.scale}, " 94 | repr_str += f"eps={self.eps})" 95 | return repr_str 96 | 97 | 98 | @POSITIONAL_ENCODING.register_module() 99 | class LearnedPositionalEncoding(BaseModule): 100 | """Position embedding with learnable embedding weights. 101 | 102 | Args: 103 | num_feats (int): The feature dimension for each position 104 | along x-axis or y-axis. The final returned dimension for 105 | each position is 2 times this value. 106 | row_num_embed (int, optional): The dictionary size of row embeddings. 107 | Default 50. 108 | col_num_embed (int, optional): The dictionary size of col embeddings. 109 | Default 50. 110 | init_cfg (dict or list[dict], optional): Initialization config dict.
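        Example (an illustrative sketch, not part of the original docstring;
            the shapes simply follow the Args above):
            >>> import torch
            >>> pos_enc = LearnedPositionalEncoding(num_feats=128)
            >>> mask = torch.zeros(2, 32, 32)
            >>> pos_enc(mask).shape
            torch.Size([2, 256, 32, 32])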
111 | """ 112 | 113 | def __init__(self, num_feats, row_num_embed=50, col_num_embed=50, init_cfg=dict(type="Uniform", layer="Embedding")): 114 | super(LearnedPositionalEncoding, self).__init__(init_cfg) 115 | self.row_embed = nn.Embedding(row_num_embed, num_feats) 116 | self.col_embed = nn.Embedding(col_num_embed, num_feats) 117 | self.num_feats = num_feats 118 | self.row_num_embed = row_num_embed 119 | self.col_num_embed = col_num_embed 120 | 121 | def forward(self, mask): 122 | """Forward function for `LearnedPositionalEncoding`. 123 | 124 | Args: 125 | mask (Tensor): ByteTensor mask. Non-zero values represent 126 | ignored positions, while zero values mean valid positions 127 | for this image. Shape [bs, h, w]. 128 | 129 | Returns: 130 | pos (Tensor): Returned position embedding with shape 131 | [bs, num_feats*2, h, w]. 132 | """ 133 | h, w = mask.shape[-2:] 134 | x = torch.arange(w, device=mask.device) 135 | y = torch.arange(h, device=mask.device) 136 | x_embed = self.col_embed(x) 137 | y_embed = self.row_embed(y) 138 | pos = ( 139 | torch.cat((x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat(1, w, 1)), dim=-1) 140 | .permute(2, 0, 1) 141 | .unsqueeze(0) 142 | .repeat(mask.shape[0], 1, 1, 1) 143 | ) 144 | return pos 145 | 146 | def __repr__(self): 147 | """str: a string that describes the module""" 148 | repr_str = self.__class__.__name__ 149 | repr_str += f"(num_feats={self.num_feats}, " 150 | repr_str += f"row_num_embed={self.row_num_embed}, " 151 | repr_str += f"col_num_embed={self.col_num_embed})" 152 | return repr_str 153 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/fundamentalvision/Deformable-DETR/tree/main/models/ops/modules 8 | # https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | 10 | from .ms_deform_attn import MSDeformAttn 11 | -------------------------------------------------------------------------------- /dinov2/eval/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
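# Usage sketch (an illustrative assumption, not part of the original file):
# the helpers defined below are typically combined from an evaluation script as
#
#   parser = get_args_parser(description="DINOv2 evaluation")
#   args = parser.parse_args()
#   model, autocast_dtype = setup_and_build_model(args)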
5 | 6 | import argparse 7 | from typing import Any, List, Optional, Tuple 8 | 9 | import torch 10 | import torch.backends.cudnn as cudnn 11 | 12 | from dinov2.models import build_model_from_cfg 13 | from dinov2.utils.config import setup 14 | import dinov2.utils.utils as dinov2_utils 15 | 16 | 17 | def get_args_parser( 18 | description: Optional[str] = None, 19 | parents: Optional[List[argparse.ArgumentParser]] = None, 20 | add_help: bool = True, 21 | ): 22 | parser = argparse.ArgumentParser( 23 | description=description, 24 | parents=parents or [], 25 | add_help=add_help, 26 | ) 27 | parser.add_argument( 28 | "--config-file", 29 | type=str, 30 | help="Model configuration file", 31 | ) 32 | parser.add_argument( 33 | "--pretrained-weights", 34 | type=str, 35 | help="Pretrained model weights", 36 | ) 37 | parser.add_argument( 38 | "--output-dir", 39 | default="", 40 | type=str, 41 | help="Output directory to write results and logs", 42 | ) 43 | parser.add_argument( 44 | "--opts", 45 | help="Extra configuration options", 46 | default=[], 47 | nargs="+", 48 | ) 49 | return parser 50 | 51 | 52 | def get_autocast_dtype(config): 53 | teacher_dtype_str = config.compute_precision.teacher.backbone.mixed_precision.param_dtype 54 | if teacher_dtype_str == "fp16": 55 | return torch.half 56 | elif teacher_dtype_str == "bf16": 57 | return torch.bfloat16 58 | else: 59 | return torch.float 60 | 61 | 62 | def build_model_for_eval(config, pretrained_weights): 63 | model, _ = build_model_from_cfg(config, only_teacher=True) 64 | dinov2_utils.load_pretrained_weights(model, pretrained_weights, "teacher") 65 | model.eval() 66 | model.cuda() 67 | return model 68 | 69 | 70 | def setup_and_build_model(args) -> Tuple[Any, torch.dtype]: 71 | cudnn.benchmark = True 72 | config = setup(args) 73 | model = build_model_for_eval(config, args.pretrained_weights) 74 | autocast_dtype = get_autocast_dtype(config) 75 | return model, autocast_dtype 76 | -------------------------------------------------------------------------------- /dinov2/eval/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
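# Usage sketch (an illustrative assumption, not part of the original file):
# the helpers below typically drive feature extraction for k-NN / linear evals as
#
#   model = ModelWithNormalize(backbone)
#   features, labels = extract_features(model, dataset, batch_size=256, num_workers=8)
#
# where `backbone` and `dataset` are hypothetical stand-ins for a DINOv2
# backbone and an ImageNet-style dataset.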
5 | 6 | import logging 7 | from typing import Dict, Optional 8 | 9 | import torch 10 | from torch import nn 11 | from torchmetrics import MetricCollection 12 | 13 | from dinov2.data import DatasetWithEnumeratedTargets, SamplerType, make_data_loader 14 | import dinov2.distributed as distributed 15 | from dinov2.logging import MetricLogger 16 | 17 | 18 | logger = logging.getLogger("dinov2") 19 | 20 | 21 | class ModelWithNormalize(torch.nn.Module): 22 | def __init__(self, model): 23 | super().__init__() 24 | self.model = model 25 | 26 | def forward(self, samples): 27 | return nn.functional.normalize(self.model(samples), dim=1, p=2) 28 | 29 | 30 | class ModelWithIntermediateLayers(nn.Module): 31 | def __init__(self, feature_model, n_last_blocks, autocast_ctx): 32 | super().__init__() 33 | self.feature_model = feature_model 34 | self.feature_model.eval() 35 | self.n_last_blocks = n_last_blocks 36 | self.autocast_ctx = autocast_ctx 37 | 38 | def forward(self, images): 39 | with torch.inference_mode(): 40 | with self.autocast_ctx(): 41 | features = self.feature_model.get_intermediate_layers( 42 | images, self.n_last_blocks, return_class_token=True 43 | ) 44 | return features 45 | 46 | 47 | @torch.inference_mode() 48 | def evaluate( 49 | model: nn.Module, 50 | data_loader, 51 | postprocessors: Dict[str, nn.Module], 52 | metrics: Dict[str, MetricCollection], 53 | device: torch.device, 54 | criterion: Optional[nn.Module] = None, 55 | ): 56 | model.eval() 57 | if criterion is not None: 58 | criterion.eval() 59 | 60 | for metric in metrics.values(): 61 | metric = metric.to(device) 62 | 63 | metric_logger = MetricLogger(delimiter=" ") 64 | header = "Test:" 65 | 66 | for samples, targets, *_ in metric_logger.log_every(data_loader, 10, header): 67 | outputs = model(samples.to(device)) 68 | targets = targets.to(device) 69 | 70 | if criterion is not None: 71 | loss = criterion(outputs, targets) 72 | metric_logger.update(loss=loss.item()) 73 | 74 | for k, metric in metrics.items(): 75 | metric_inputs = postprocessors[k](outputs, targets) 76 | metric.update(**metric_inputs) 77 | 78 | metric_logger.synchronize_between_processes() 79 | logger.info(f"Averaged stats: {metric_logger}") 80 | 81 | stats = {k: metric.compute() for k, metric in metrics.items()} 82 | metric_logger_stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 83 | return metric_logger_stats, stats 84 | 85 | 86 | def all_gather_and_flatten(tensor_rank): 87 | tensor_all_ranks = torch.empty( 88 | distributed.get_global_size(), 89 | *tensor_rank.shape, 90 | dtype=tensor_rank.dtype, 91 | device=tensor_rank.device, 92 | ) 93 | tensor_list = list(tensor_all_ranks.unbind(0)) 94 | torch.distributed.all_gather(tensor_list, tensor_rank.contiguous()) 95 | return tensor_all_ranks.flatten(end_dim=1) 96 | 97 | 98 | def extract_features(model, dataset, batch_size, num_workers, gather_on_cpu=False): 99 | dataset_with_enumerated_targets = DatasetWithEnumeratedTargets(dataset) 100 | sample_count = len(dataset_with_enumerated_targets) 101 | data_loader = make_data_loader( 102 | dataset=dataset_with_enumerated_targets, 103 | batch_size=batch_size, 104 | num_workers=num_workers, 105 | sampler_type=SamplerType.DISTRIBUTED, 106 | drop_last=False, 107 | shuffle=False, 108 | ) 109 | return extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu) 110 | 111 | 112 | @torch.inference_mode() 113 | def extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu=False): 114 | gather_device = 
torch.device("cpu") if gather_on_cpu else torch.device("cuda") 115 | metric_logger = MetricLogger(delimiter=" ") 116 | features, all_labels = None, None 117 | for samples, (index, labels_rank) in metric_logger.log_every(data_loader, 10): 118 | samples = samples.cuda(non_blocking=True) 119 | labels_rank = labels_rank.cuda(non_blocking=True) 120 | index = index.cuda(non_blocking=True) 121 | features_rank = model(samples).float() 122 | 123 | # init storage feature matrix 124 | if features is None: 125 | features = torch.zeros(sample_count, features_rank.shape[-1], device=gather_device) 126 | labels_shape = list(labels_rank.shape) 127 | labels_shape[0] = sample_count 128 | all_labels = torch.full(labels_shape, fill_value=-1, device=gather_device) 129 | logger.info(f"Storing features into tensor of shape {features.shape}") 130 | 131 | # share indexes, features and labels between processes 132 | index_all = all_gather_and_flatten(index).to(gather_device) 133 | features_all_ranks = all_gather_and_flatten(features_rank).to(gather_device) 134 | labels_all_ranks = all_gather_and_flatten(labels_rank).to(gather_device) 135 | 136 | # update storage feature matrix 137 | if len(index_all) > 0: 138 | features.index_copy_(0, index_all, features_all_ranks) 139 | all_labels.index_copy_(0, index_all, labels_all_ranks) 140 | 141 | logger.info(f"Features shape: {tuple(features.shape)}") 142 | logger.info(f"Labels shape: {tuple(all_labels.shape)}") 143 | 144 | assert torch.all(all_labels > -1) 145 | 146 | return features, all_labels 147 | -------------------------------------------------------------------------------- /dinov2/fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
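# Usage sketch (an illustrative assumption, not part of the original file):
# get_fsdp_wrapper below turns a model config into a partially-applied FSDP
# constructor, e.g.
#
#   wrapper = get_fsdp_wrapper(cfg.compute_precision.teacher.backbone, modules_to_wrap={BlockChunk})
#   sharded_model = wrapper(model)
#
# where `cfg`, `model` and `BlockChunk` are hypothetical names; the config is
# expected to carry the mixed_precision and sharding_strategy fields read below.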
5 | 6 | import os 7 | from typing import Any 8 | 9 | import torch 10 | import dinov2.distributed as distributed 11 | from functools import partial 12 | from fvcore.common.checkpoint import Checkpointer 13 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP 14 | from torch.distributed.fsdp import ShardingStrategy 15 | from torch.distributed.fsdp import MixedPrecision 16 | from torch.distributed.fsdp import StateDictType 17 | from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler 18 | from torch.distributed.fsdp.wrap import ModuleWrapPolicy 19 | from torch.distributed.fsdp._runtime_utils import _reshard 20 | 21 | 22 | def get_fsdp_wrapper(model_cfg, modules_to_wrap=set()): 23 | sharding_strategy_dict = { 24 | "NO_SHARD": ShardingStrategy.NO_SHARD, 25 | "SHARD_GRAD_OP": ShardingStrategy.SHARD_GRAD_OP, 26 | "FULL_SHARD": ShardingStrategy.FULL_SHARD, 27 | } 28 | 29 | dtype_dict = { 30 | "fp32": torch.float32, 31 | "fp16": torch.float16, 32 | "bf16": torch.bfloat16, 33 | } 34 | 35 | mixed_precision_config = MixedPrecision( 36 | param_dtype=dtype_dict[model_cfg.mixed_precision.param_dtype], 37 | reduce_dtype=dtype_dict[model_cfg.mixed_precision.reduce_dtype], 38 | buffer_dtype=dtype_dict[model_cfg.mixed_precision.buffer_dtype], 39 | ) 40 | 41 | sharding_strategy_config = sharding_strategy_dict[model_cfg.sharding_strategy] 42 | 43 | local_rank = distributed.get_local_rank() 44 | 45 | fsdp_wrapper = partial( 46 | FSDP, 47 | sharding_strategy=sharding_strategy_config, 48 | mixed_precision=mixed_precision_config, 49 | device_id=local_rank, 50 | sync_module_states=True, 51 | use_orig_params=True, 52 | auto_wrap_policy=ModuleWrapPolicy(modules_to_wrap), 53 | ) 54 | return fsdp_wrapper 55 | 56 | 57 | def is_fsdp(x): 58 | return isinstance(x, FSDP) 59 | 60 | 61 | def is_sharded_fsdp(x): 62 | return is_fsdp(x) and x.sharding_strategy is not ShardingStrategy.NO_SHARD 63 | 64 | 65 | def free_if_fsdp(x): 66 | if is_sharded_fsdp(x): 67 | handles = x._handles 68 | true_list = [True for h in handles] 69 | _reshard(x, handles, true_list) 70 | 71 | 72 | def get_fsdp_modules(x): 73 | return FSDP.fsdp_modules(x) 74 | 75 | 76 | def reshard_fsdp_model(x): 77 | for m in get_fsdp_modules(x): 78 | free_if_fsdp(m) 79 | 80 | 81 | def rankstr(): 82 | return f"rank_{distributed.get_global_rank()}" 83 | 84 | 85 | class FSDPCheckpointer(Checkpointer): 86 | def save(self, name: str, **kwargs: Any) -> None: 87 | """ 88 | Dump model and checkpointables to a file. 89 | 90 | Args: 91 | name (str): name of the file. 92 | kwargs (dict): extra arbitrary data to save. 
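        Example (an illustrative sketch, not part of the original docstring;
            the model, optimizer and iteration value are hypothetical):
                checkpointer = FSDPCheckpointer(model, save_dir="output", optimizer=optimizer)
                checkpointer.save("model_0012499", iteration=12499)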
93 | """ 94 | if not self.save_dir or not self.save_to_disk: 95 | return 96 | 97 | data = {} 98 | with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): 99 | data["model"] = self.model.state_dict() 100 | 101 | # data["model"] = self.model.state_dict() 102 | for key, obj in self.checkpointables.items(): 103 | data[key] = obj.state_dict() 104 | data.update(kwargs) 105 | 106 | basename = f"{name}.{rankstr()}.pth" 107 | save_file = os.path.join(self.save_dir, basename) 108 | assert os.path.basename(save_file) == basename, basename 109 | self.logger.info("Saving checkpoint to {}".format(save_file)) 110 | with self.path_manager.open(save_file, "wb") as f: 111 | torch.save(data, f) 112 | self.tag_last_checkpoint(basename) 113 | 114 | def load(self, *args, **kwargs): 115 | with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): 116 | return super().load(*args, **kwargs) 117 | 118 | def has_checkpoint(self) -> bool: 119 | """ 120 | Returns: 121 | bool: whether a checkpoint exists in the target directory. 122 | """ 123 | save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") 124 | return self.path_manager.exists(save_file) 125 | 126 | def get_checkpoint_file(self) -> str: 127 | """ 128 | Returns: 129 | str: The latest checkpoint file in target directory. 130 | """ 131 | save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") 132 | try: 133 | with self.path_manager.open(save_file, "r") as f: 134 | last_saved = f.read().strip() 135 | except IOError: 136 | # if file doesn't exist, maybe because it has just been 137 | # deleted by a separate process 138 | return "" 139 | # pyre-fixme[6]: For 2nd param expected `Union[PathLike[str], str]` but got 140 | # `Union[bytes, str]`. 141 | return os.path.join(self.save_dir, last_saved) 142 | 143 | def tag_last_checkpoint(self, last_filename_basename: str) -> None: 144 | """ 145 | Tag the last checkpoint. 146 | 147 | Args: 148 | last_filename_basename (str): the basename of the last filename. 149 | """ 150 | if distributed.is_enabled(): 151 | torch.distributed.barrier() 152 | save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") 153 | with self.path_manager.open(save_file, "w") as f: 154 | f.write(last_filename_basename) # pyre-ignore 155 | 156 | 157 | ShardedGradScaler = ShardedGradScaler 158 | -------------------------------------------------------------------------------- /dinov2/hub/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/hub/backbones.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
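# The factory below backs the torch.hub entry points; a typical call (shown
# here as an illustration) is
#
#   model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")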
5 | 6 | from enum import Enum 7 | from typing import Union 8 | 9 | import torch 10 | 11 | from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name 12 | 13 | 14 | class Weights(Enum): 15 | LVD142M = "LVD142M" 16 | 17 | 18 | def _make_dinov2_model( 19 | *, 20 | arch_name: str = "vit_large", 21 | img_size: int = 518, 22 | patch_size: int = 14, 23 | init_values: float = 1.0, 24 | ffn_layer: str = "mlp", 25 | block_chunks: int = 0, 26 | num_register_tokens: int = 0, 27 | interpolate_antialias: bool = False, 28 | interpolate_offset: float = 0.1, 29 | pretrained: bool = True, 30 | weights: Union[Weights, str] = Weights.LVD142M, 31 | **kwargs, 32 | ): 33 | from ..models import vision_transformer as vits 34 | 35 | if isinstance(weights, str): 36 | try: 37 | weights = Weights[weights] 38 | except KeyError: 39 | raise AssertionError(f"Unsupported weights: {weights}") 40 | 41 | model_base_name = _make_dinov2_model_name(arch_name, patch_size) 42 | vit_kwargs = dict( 43 | img_size=img_size, 44 | patch_size=patch_size, 45 | init_values=init_values, 46 | ffn_layer=ffn_layer, 47 | block_chunks=block_chunks, 48 | num_register_tokens=num_register_tokens, 49 | interpolate_antialias=interpolate_antialias, 50 | interpolate_offset=interpolate_offset, 51 | ) 52 | vit_kwargs.update(**kwargs) 53 | model = vits.__dict__[arch_name](**vit_kwargs) 54 | 55 | if pretrained: 56 | model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens) 57 | url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth" 58 | state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") 59 | model.load_state_dict(state_dict, strict=True) 60 | 61 | return model 62 | 63 | 64 | def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 65 | """ 66 | DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset. 67 | """ 68 | return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs) 69 | 70 | 71 | def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 72 | """ 73 | DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset. 74 | """ 75 | return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs) 76 | 77 | 78 | def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 79 | """ 80 | DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset. 81 | """ 82 | return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs) 83 | 84 | 85 | def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 86 | """ 87 | DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset. 88 | """ 89 | return _make_dinov2_model( 90 | arch_name="vit_giant2", 91 | ffn_layer="swiglufused", 92 | weights=weights, 93 | pretrained=pretrained, 94 | **kwargs, 95 | ) 96 | 97 | 98 | def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 99 | """ 100 | DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset. 
101 | """ 102 | return _make_dinov2_model( 103 | arch_name="vit_small", 104 | pretrained=pretrained, 105 | weights=weights, 106 | num_register_tokens=4, 107 | interpolate_antialias=True, 108 | interpolate_offset=0.0, 109 | **kwargs, 110 | ) 111 | 112 | 113 | def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 114 | """ 115 | DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset. 116 | """ 117 | return _make_dinov2_model( 118 | arch_name="vit_base", 119 | pretrained=pretrained, 120 | weights=weights, 121 | num_register_tokens=4, 122 | interpolate_antialias=True, 123 | interpolate_offset=0.0, 124 | **kwargs, 125 | ) 126 | 127 | 128 | def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 129 | """ 130 | DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset. 131 | """ 132 | return _make_dinov2_model( 133 | arch_name="vit_large", 134 | pretrained=pretrained, 135 | weights=weights, 136 | num_register_tokens=4, 137 | interpolate_antialias=True, 138 | interpolate_offset=0.0, 139 | **kwargs, 140 | ) 141 | 142 | 143 | def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 144 | """ 145 | DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset. 146 | """ 147 | return _make_dinov2_model( 148 | arch_name="vit_giant2", 149 | ffn_layer="swiglufused", 150 | weights=weights, 151 | pretrained=pretrained, 152 | num_register_tokens=4, 153 | interpolate_antialias=True, 154 | interpolate_offset=0.0, 155 | **kwargs, 156 | ) 157 | -------------------------------------------------------------------------------- /dinov2/hub/depth/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .decode_heads import BNHead, DPTHead 7 | from .encoder_decoder import DepthEncoderDecoder 8 | -------------------------------------------------------------------------------- /dinov2/hub/depth/ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
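# Usage sketch (an illustrative assumption, not part of the original file):
# `resize` below is a thin wrapper around F.interpolate that can warn about
# align_corners misuse, e.g.
#
#   out = resize(feat, size=(224, 224), mode="bilinear", align_corners=False)
#
# where `feat` is a hypothetical NCHW feature tensor.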
5 | 6 | import warnings 7 | 8 | import torch.nn.functional as F 9 | 10 | 11 | def resize(input, size=None, scale_factor=None, mode="nearest", align_corners=None, warning=False): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > input_w: 17 | if ( 18 | (output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1) 19 | and (output_h - 1) % (input_h - 1) 20 | and (output_w - 1) % (input_w - 1) 21 | ): 22 | warnings.warn( 23 | f"When align_corners={align_corners}, " 24 | "the output would be more aligned if " 25 | f"input size {(input_h, input_w)} is `x+1` and " 26 | f"out size {(output_h, output_w)} is `nx+1`" 27 | ) 28 | return F.interpolate(input, size, scale_factor, mode, align_corners) 29 | -------------------------------------------------------------------------------- /dinov2/hub/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import itertools 7 | import math 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | 14 | _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" 15 | 16 | 17 | def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: 18 | compact_arch_name = arch_name.replace("_", "")[:4] 19 | registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" 20 | return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" 21 | 22 | 23 | class CenterPadding(nn.Module): 24 | def __init__(self, multiple): 25 | super().__init__() 26 | self.multiple = multiple 27 | 28 | def _get_pad(self, size): 29 | new_size = math.ceil(size / self.multiple) * self.multiple 30 | pad_size = new_size - size 31 | pad_size_left = pad_size // 2 32 | pad_size_right = pad_size - pad_size_left 33 | return pad_size_left, pad_size_right 34 | 35 | @torch.inference_mode() 36 | def forward(self, x): 37 | pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1])) 38 | output = F.pad(x, pads) 39 | return output 40 | -------------------------------------------------------------------------------- /dinov2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .dino_head import DINOHead 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /dinov2/layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
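# Shape sketch (an illustrative assumption, not part of the original file):
#
#   attn = Attention(dim=384, num_heads=6, qkv_bias=True)
#   y = attn(torch.randn(2, 197, 384))  # output keeps the (B, N, C) shape
#
# MemEffAttention below is a drop-in variant that routes through xFormers'
# memory_efficient_attention when it is available.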
5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 9 | 10 | import logging 11 | import os 12 | import warnings 13 | 14 | from torch import Tensor 15 | from torch import nn 16 | 17 | 18 | logger = logging.getLogger("dinov2") 19 | 20 | 21 | XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None 22 | try: 23 | if XFORMERS_ENABLED: 24 | from xformers.ops import memory_efficient_attention, unbind 25 | 26 | XFORMERS_AVAILABLE = True 27 | warnings.warn("xFormers is available (Attention)") 28 | else: 29 | warnings.warn("xFormers is disabled (Attention)") 30 | raise ImportError 31 | except ImportError: 32 | XFORMERS_AVAILABLE = False 33 | warnings.warn("xFormers is not available (Attention)") 34 | 35 | 36 | class Attention(nn.Module): 37 | def __init__( 38 | self, 39 | dim: int, 40 | num_heads: int = 8, 41 | qkv_bias: bool = False, 42 | proj_bias: bool = True, 43 | attn_drop: float = 0.0, 44 | proj_drop: float = 0.0, 45 | ) -> None: 46 | super().__init__() 47 | self.num_heads = num_heads 48 | head_dim = dim // num_heads 49 | self.scale = head_dim**-0.5 50 | 51 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 52 | self.attn_drop = nn.Dropout(attn_drop) 53 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 54 | self.proj_drop = nn.Dropout(proj_drop) 55 | 56 | def forward(self, x: Tensor) -> Tensor: 57 | B, N, C = x.shape 58 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 59 | 60 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 61 | attn = q @ k.transpose(-2, -1) 62 | 63 | attn = attn.softmax(dim=-1) 64 | attn = self.attn_drop(attn) 65 | 66 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 67 | x = self.proj(x) 68 | x = self.proj_drop(x) 69 | return x 70 | 71 | 72 | class MemEffAttention(Attention): 73 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 74 | if not XFORMERS_AVAILABLE: 75 | if attn_bias is not None: 76 | raise AssertionError("xFormers is required for using nested tensors") 77 | return super().forward(x) 78 | 79 | B, N, C = x.shape 80 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 81 | 82 | q, k, v = unbind(qkv, 2) 83 | 84 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 85 | x = x.reshape([B, N, C]) 86 | 87 | x = self.proj(x) 88 | x = self.proj_drop(x) 89 | return x 90 | -------------------------------------------------------------------------------- /dinov2/layers/dino_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
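# Usage sketch (an illustrative assumption, not part of the original file):
# the head maps backbone embeddings to prototype logits, e.g.
#
#   head = DINOHead(in_dim=384, out_dim=65536)
#   logits = head(torch.randn(8, 384))  # -> shape (8, 65536)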
5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn.init import trunc_normal_ 9 | from torch.nn.utils import weight_norm 10 | 11 | 12 | class DINOHead(nn.Module): 13 | def __init__( 14 | self, 15 | in_dim, 16 | out_dim, 17 | use_bn=False, 18 | nlayers=3, 19 | hidden_dim=2048, 20 | bottleneck_dim=256, 21 | mlp_bias=True, 22 | ): 23 | super().__init__() 24 | nlayers = max(nlayers, 1) 25 | self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) 26 | self.apply(self._init_weights) 27 | self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) 28 | self.last_layer.weight_g.data.fill_(1) 29 | 30 | def _init_weights(self, m): 31 | if isinstance(m, nn.Linear): 32 | trunc_normal_(m.weight, std=0.02) 33 | if isinstance(m, nn.Linear) and m.bias is not None: 34 | nn.init.constant_(m.bias, 0) 35 | 36 | def forward(self, x): 37 | x = self.mlp(x) 38 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 39 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 40 | x = self.last_layer(x) 41 | return x 42 | 43 | 44 | def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): 45 | if nlayers == 1: 46 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 47 | else: 48 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 49 | if use_bn: 50 | layers.append(nn.BatchNorm1d(hidden_dim)) 51 | layers.append(nn.GELU()) 52 | for _ in range(nlayers - 2): 53 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 54 | if use_bn: 55 | layers.append(nn.BatchNorm1d(hidden_dim)) 56 | layers.append(nn.GELU()) 57 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 58 | return nn.Sequential(*layers) 59 | -------------------------------------------------------------------------------- /dinov2/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 9 | 10 | 11 | from torch import nn 12 | 13 | 14 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 15 | if drop_prob == 0.0 or not training: 16 | return x 17 | keep_prob = 1 - drop_prob 18 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 19 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 20 | if keep_prob > 0.0: 21 | random_tensor.div_(keep_prob) 22 | output = x * random_tensor 23 | return output 24 | 25 | 26 | class DropPath(nn.Module): 27 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 28 | 29 | def __init__(self, drop_prob=None): 30 | super(DropPath, self).__init__() 31 | self.drop_prob = drop_prob 32 | 33 | def forward(self, x): 34 | return drop_path(x, self.drop_prob, self.training) 35 | -------------------------------------------------------------------------------- /dinov2/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
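# Sketch (an illustrative assumption, not part of the original file):
# LayerScale rescales the channel dimension with a learnable gamma, e.g.
#
#   ls = LayerScale(dim=384, init_values=1e-5)
#   y = ls(torch.randn(2, 197, 384))  # each channel multiplied by its gamma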
5 | 6 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 7 | 8 | from typing import Optional, Union 9 | 10 | import torch 11 | from torch import Tensor 12 | from torch import nn 13 | 14 | 15 | class LayerScale(nn.Module): 16 | def __init__( 17 | self, 18 | dim: int, 19 | init_values: Union[float, Tensor] = 1e-5, 20 | inplace: bool = False, 21 | device: Optional[torch.device] = None, 22 | dtype: Optional[torch.dtype] = None, 23 | ) -> None: 24 | super().__init__() 25 | self.inplace = inplace 26 | self.init_values = init_values 27 | self.gamma = nn.Parameter(torch.empty(dim, device=device, dtype=dtype)) 28 | self.reset_parameters() 29 | 30 | def reset_parameters(self): 31 | nn.init.constant_(self.gamma, self.init_values) 32 | 33 | def forward(self, x: Tensor) -> Tensor: 34 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 35 | -------------------------------------------------------------------------------- /dinov2/layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 9 | 10 | 11 | from typing import Callable, Optional 12 | 13 | from torch import Tensor, nn 14 | 15 | 16 | class Mlp(nn.Module): 17 | def __init__( 18 | self, 19 | in_features: int, 20 | hidden_features: Optional[int] = None, 21 | out_features: Optional[int] = None, 22 | act_layer: Callable[..., nn.Module] = nn.GELU, 23 | drop: float = 0.0, 24 | bias: bool = True, 25 | ) -> None: 26 | super().__init__() 27 | out_features = out_features or in_features 28 | hidden_features = hidden_features or in_features 29 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 30 | self.act = act_layer() 31 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 32 | self.drop = nn.Dropout(drop) 33 | 34 | def forward(self, x: Tensor) -> Tensor: 35 | x = self.fc1(x) 36 | x = self.act(x) 37 | x = self.drop(x) 38 | x = self.fc2(x) 39 | x = self.drop(x) 40 | return x 41 | -------------------------------------------------------------------------------- /dinov2/layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 9 | 10 | from typing import Callable, Optional, Tuple, Union 11 | 12 | from torch import Tensor 13 | import torch.nn as nn 14 | 15 | 16 | def make_2tuple(x): 17 | if isinstance(x, tuple): 18 | assert len(x) == 2 19 | return x 20 | 21 | assert isinstance(x, int) 22 | return (x, x) 23 | 24 | 25 | class PatchEmbed(nn.Module): 26 | """ 27 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 28 | 29 | Args: 30 | img_size: Image size. 31 | patch_size: Patch token size. 32 | in_chans: Number of input image channels. 
33 | embed_dim: Number of linear projection output channels. 34 | norm_layer: Normalization layer. 35 | """ 36 | 37 | def __init__( 38 | self, 39 | img_size: Union[int, Tuple[int, int]] = 224, 40 | patch_size: Union[int, Tuple[int, int]] = 16, 41 | in_chans: int = 3, 42 | embed_dim: int = 768, 43 | norm_layer: Optional[Callable] = None, 44 | flatten_embedding: bool = True, 45 | ) -> None: 46 | super().__init__() 47 | 48 | image_HW = make_2tuple(img_size) 49 | patch_HW = make_2tuple(patch_size) 50 | patch_grid_size = ( 51 | image_HW[0] // patch_HW[0], 52 | image_HW[1] // patch_HW[1], 53 | ) 54 | 55 | self.img_size = image_HW 56 | self.patch_size = patch_HW 57 | self.patches_resolution = patch_grid_size 58 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 59 | 60 | self.in_chans = in_chans 61 | self.embed_dim = embed_dim 62 | 63 | self.flatten_embedding = flatten_embedding 64 | 65 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 66 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 67 | 68 | def forward(self, x: Tensor) -> Tensor: 69 | _, _, H, W = x.shape 70 | patch_H, patch_W = self.patch_size 71 | 72 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 73 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 74 | 75 | x = self.proj(x) # B C H W 76 | H, W = x.size(2), x.size(3) 77 | x = x.flatten(2).transpose(1, 2) # B HW C 78 | x = self.norm(x) 79 | if not self.flatten_embedding: 80 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 81 | return x 82 | 83 | def flops(self) -> float: 84 | Ho, Wo = self.patches_resolution 85 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 86 | if self.norm is not None: 87 | flops += Ho * Wo * self.embed_dim 88 | return flops 89 | -------------------------------------------------------------------------------- /dinov2/layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
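# Shape sketch (an illustrative assumption, not part of the original file):
#
#   ffn = SwiGLUFFN(in_features=384, hidden_features=1024)
#   y = ffn(torch.randn(2, 197, 384))  # -> (2, 197, 384)
#
# w12 projects to 2 * hidden_features; the result is split in two, gated with
# SiLU, and projected back to out_features by w3.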
5 | 6 | import os 7 | from typing import Callable, Optional 8 | import warnings 9 | 10 | from torch import Tensor, nn 11 | import torch.nn.functional as F 12 | 13 | 14 | class SwiGLUFFN(nn.Module): 15 | def __init__( 16 | self, 17 | in_features: int, 18 | hidden_features: Optional[int] = None, 19 | out_features: Optional[int] = None, 20 | act_layer: Callable[..., nn.Module] = None, 21 | drop: float = 0.0, 22 | bias: bool = True, 23 | ) -> None: 24 | super().__init__() 25 | out_features = out_features or in_features 26 | hidden_features = hidden_features or in_features 27 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 28 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 29 | 30 | def forward(self, x: Tensor) -> Tensor: 31 | x12 = self.w12(x) 32 | x1, x2 = x12.chunk(2, dim=-1) 33 | hidden = F.silu(x1) * x2 34 | return self.w3(hidden) 35 | 36 | 37 | XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None 38 | try: 39 | if XFORMERS_ENABLED: 40 | from xformers.ops import SwiGLU 41 | 42 | XFORMERS_AVAILABLE = True 43 | warnings.warn("xFormers is available (SwiGLU)") 44 | else: 45 | warnings.warn("xFormers is disabled (SwiGLU)") 46 | raise ImportError 47 | except ImportError: 48 | SwiGLU = SwiGLUFFN 49 | XFORMERS_AVAILABLE = False 50 | 51 | warnings.warn("xFormers is not available (SwiGLU)") 52 | 53 | 54 | class SwiGLUFFNFused(SwiGLU): 55 | def __init__( 56 | self, 57 | in_features: int, 58 | hidden_features: Optional[int] = None, 59 | out_features: Optional[int] = None, 60 | act_layer: Callable[..., nn.Module] = None, 61 | drop: float = 0.0, 62 | bias: bool = True, 63 | ) -> None: 64 | out_features = out_features or in_features 65 | hidden_features = hidden_features or in_features 66 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 67 | super().__init__( 68 | in_features=in_features, 69 | hidden_features=hidden_features, 70 | out_features=out_features, 71 | bias=bias, 72 | ) 73 | -------------------------------------------------------------------------------- /dinov2/logging/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import functools 7 | import logging 8 | import os 9 | import sys 10 | from typing import Optional 11 | 12 | import dinov2.distributed as distributed 13 | from .helpers import MetricLogger, SmoothedValue 14 | 15 | 16 | # So that calling _configure_logger multiple times won't add many handlers 17 | @functools.lru_cache() 18 | def _configure_logger( 19 | name: Optional[str] = None, 20 | *, 21 | level: int = logging.DEBUG, 22 | output: Optional[str] = None, 23 | ): 24 | """ 25 | Configure a logger. 26 | 27 | Adapted from Detectron2. 28 | 29 | Args: 30 | name: The name of the logger to configure. 31 | level: The logging level to use. 32 | output: A file name or a directory to save log. If None, will not save log file. 33 | If ends with ".txt" or ".log", assumed to be a file name. 34 | Otherwise, logs will be saved to `output/log.txt`. 35 | 36 | Returns: 37 | The configured logger. 
38 | """ 39 | 40 | logger = logging.getLogger(name) 41 | logger.setLevel(level) 42 | logger.propagate = False 43 | 44 | # Loosely match Google glog format: 45 | # [IWEF]yyyymmdd hh:mm:ss.uuuuuu threadid file:line] msg 46 | # but use a shorter timestamp and include the logger name: 47 | # [IWEF]yyyymmdd hh:mm:ss logger threadid file:line] msg 48 | fmt_prefix = "%(levelname).1s%(asctime)s %(process)s %(name)s %(filename)s:%(lineno)s] " 49 | fmt_message = "%(message)s" 50 | fmt = fmt_prefix + fmt_message 51 | datefmt = "%Y%m%d %H:%M:%S" 52 | formatter = logging.Formatter(fmt=fmt, datefmt=datefmt) 53 | 54 | # stdout logging for main worker only 55 | if distributed.is_main_process(): 56 | handler = logging.StreamHandler(stream=sys.stdout) 57 | handler.setLevel(logging.DEBUG) 58 | handler.setFormatter(formatter) 59 | logger.addHandler(handler) 60 | 61 | # file logging for all workers 62 | if output: 63 | if os.path.splitext(output)[-1] in (".txt", ".log"): 64 | filename = output 65 | else: 66 | filename = os.path.join(output, "logs", "log.txt") 67 | 68 | if not distributed.is_main_process(): 69 | global_rank = distributed.get_global_rank() 70 | filename = filename + ".rank{}".format(global_rank) 71 | 72 | os.makedirs(os.path.dirname(filename), exist_ok=True) 73 | 74 | handler = logging.StreamHandler(open(filename, "a")) 75 | handler.setLevel(logging.DEBUG) 76 | handler.setFormatter(formatter) 77 | logger.addHandler(handler) 78 | 79 | return logger 80 | 81 | 82 | def setup_logging( 83 | output: Optional[str] = None, 84 | *, 85 | name: Optional[str] = None, 86 | level: int = logging.DEBUG, 87 | capture_warnings: bool = True, 88 | ) -> None: 89 | """ 90 | Setup logging. 91 | 92 | Args: 93 | output: A file name or a directory to save log files. If None, log 94 | files will not be saved. If output ends with ".txt" or ".log", it 95 | is assumed to be a file name. 96 | Otherwise, logs will be saved to `output/log.txt`. 97 | name: The name of the logger to configure, by default the root logger. 98 | level: The logging level to use. 99 | capture_warnings: Whether warnings should be captured as logs. 100 | """ 101 | logging.captureWarnings(capture_warnings) 102 | _configure_logger(name, level=level, output=output) 103 | -------------------------------------------------------------------------------- /dinov2/logging/helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
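# Usage sketch (an illustrative assumption, not part of the original file):
#
#   metric_logger = MetricLogger(delimiter="  ")
#   for batch in metric_logger.log_every(loader, 10, header="Train:"):
#       ...
#       metric_logger.update(loss=loss_value)
#
# where `loader` and `loss_value` are hypothetical.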
5 | 6 | from collections import defaultdict, deque 7 | import datetime 8 | import json 9 | import logging 10 | import time 11 | 12 | import torch 13 | 14 | import dinov2.distributed as distributed 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | class MetricLogger(object): 21 | def __init__(self, delimiter="\t", output_file=None): 22 | self.meters = defaultdict(SmoothedValue) 23 | self.delimiter = delimiter 24 | self.output_file = output_file 25 | 26 | def update(self, **kwargs): 27 | for k, v in kwargs.items(): 28 | if isinstance(v, torch.Tensor): 29 | v = v.item() 30 | assert isinstance(v, (float, int)) 31 | self.meters[k].update(v) 32 | 33 | def __getattr__(self, attr): 34 | if attr in self.meters: 35 | return self.meters[attr] 36 | if attr in self.__dict__: 37 | return self.__dict__[attr] 38 | raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr)) 39 | 40 | def __str__(self): 41 | loss_str = [] 42 | for name, meter in self.meters.items(): 43 | loss_str.append("{}: {}".format(name, str(meter))) 44 | return self.delimiter.join(loss_str) 45 | 46 | def synchronize_between_processes(self): 47 | for meter in self.meters.values(): 48 | meter.synchronize_between_processes() 49 | 50 | def add_meter(self, name, meter): 51 | self.meters[name] = meter 52 | 53 | def dump_in_output_file(self, iteration, iter_time, data_time): 54 | if self.output_file is None or not distributed.is_main_process(): 55 | return 56 | dict_to_dump = dict( 57 | iteration=iteration, 58 | iter_time=iter_time, 59 | data_time=data_time, 60 | ) 61 | dict_to_dump.update({k: v.median for k, v in self.meters.items()}) 62 | with open(self.output_file, "a") as f: 63 | f.write(json.dumps(dict_to_dump) + "\n") 64 | pass 65 | 66 | def log_every(self, iterable, print_freq, header=None, n_iterations=None, start_iteration=0): 67 | i = start_iteration 68 | if not header: 69 | header = "" 70 | start_time = time.time() 71 | end = time.time() 72 | iter_time = SmoothedValue(fmt="{avg:.6f}") 73 | data_time = SmoothedValue(fmt="{avg:.6f}") 74 | 75 | if n_iterations is None: 76 | n_iterations = len(iterable) 77 | 78 | space_fmt = ":" + str(len(str(n_iterations))) + "d" 79 | 80 | log_list = [ 81 | header, 82 | "[{0" + space_fmt + "}/{1}]", 83 | "eta: {eta}", 84 | "{meters}", 85 | "time: {time}", 86 | "data: {data}", 87 | ] 88 | if torch.cuda.is_available(): 89 | log_list += ["max mem: {memory:.0f}"] 90 | 91 | log_msg = self.delimiter.join(log_list) 92 | MB = 1024.0 * 1024.0 93 | for obj in iterable: 94 | data_time.update(time.time() - end) 95 | yield obj 96 | iter_time.update(time.time() - end) 97 | if i % print_freq == 0 or i == n_iterations - 1: 98 | self.dump_in_output_file(iteration=i, iter_time=iter_time.avg, data_time=data_time.avg) 99 | eta_seconds = iter_time.global_avg * (n_iterations - i) 100 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 101 | if torch.cuda.is_available(): 102 | logger.info( 103 | log_msg.format( 104 | i, 105 | n_iterations, 106 | eta=eta_string, 107 | meters=str(self), 108 | time=str(iter_time), 109 | data=str(data_time), 110 | memory=torch.cuda.max_memory_allocated() / MB, 111 | ) 112 | ) 113 | else: 114 | logger.info( 115 | log_msg.format( 116 | i, 117 | n_iterations, 118 | eta=eta_string, 119 | meters=str(self), 120 | time=str(iter_time), 121 | data=str(data_time), 122 | ) 123 | ) 124 | i += 1 125 | end = time.time() 126 | if i >= n_iterations: 127 | break 128 | total_time = time.time() - start_time 129 | total_time_str = 
str(datetime.timedelta(seconds=int(total_time))) 130 | logger.info("{} Total time: {} ({:.6f} s / it)".format(header, total_time_str, total_time / n_iterations)) 131 | 132 | 133 | class SmoothedValue: 134 | """Track a series of values and provide access to smoothed values over a 135 | window or the global series average. 136 | """ 137 | 138 | def __init__(self, window_size=20, fmt=None): 139 | if fmt is None: 140 | fmt = "{median:.4f} ({global_avg:.4f})" 141 | self.deque = deque(maxlen=window_size) 142 | self.total = 0.0 143 | self.count = 0 144 | self.fmt = fmt 145 | 146 | def update(self, value, num=1): 147 | self.deque.append(value) 148 | self.count += num 149 | self.total += value * num 150 | 151 | def synchronize_between_processes(self): 152 | """ 153 | Distributed synchronization of the metric 154 | Warning: does not synchronize the deque! 155 | """ 156 | if not distributed.is_enabled(): 157 | return 158 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") 159 | torch.distributed.barrier() 160 | torch.distributed.all_reduce(t) 161 | t = t.tolist() 162 | self.count = int(t[0]) 163 | self.total = t[1] 164 | 165 | @property 166 | def median(self): 167 | d = torch.tensor(list(self.deque)) 168 | return d.median().item() 169 | 170 | @property 171 | def avg(self): 172 | d = torch.tensor(list(self.deque), dtype=torch.float32) 173 | return d.mean().item() 174 | 175 | @property 176 | def global_avg(self): 177 | return self.total / self.count 178 | 179 | @property 180 | def max(self): 181 | return max(self.deque) 182 | 183 | @property 184 | def value(self): 185 | return self.deque[-1] 186 | 187 | def __str__(self): 188 | return self.fmt.format( 189 | median=self.median, 190 | avg=self.avg, 191 | global_avg=self.global_avg, 192 | max=self.max, 193 | value=self.value, 194 | ) 195 | -------------------------------------------------------------------------------- /dinov2/loss/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .dino_clstoken_loss import DINOLoss 7 | from .ibot_patch_loss import iBOTPatchLoss 8 | from .koleo_loss import KoLeoLoss 9 | -------------------------------------------------------------------------------- /dinov2/loss/dino_clstoken_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
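# Usage sketch (an illustrative assumption, not part of the original file):
#
#   dino_loss = DINOLoss(out_dim=65536)
#   teacher_probs = dino_loss.softmax_center_teacher(teacher_logits, teacher_temp=0.07)
#   loss = dino_loss([student_logits], [teacher_probs])
#   dino_loss.update_center(teacher_logits)
#
# where student_logits / teacher_logits are hypothetical (B, out_dim) tensors
# and 0.07 is just an example temperature.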
5 | 6 | import torch 7 | import torch.distributed as dist 8 | import torch.nn.functional as F 9 | from torch import nn 10 | 11 | 12 | class DINOLoss(nn.Module): 13 | def __init__( 14 | self, 15 | out_dim, 16 | student_temp=0.1, 17 | center_momentum=0.9, 18 | ): 19 | super().__init__() 20 | self.student_temp = student_temp 21 | self.center_momentum = center_momentum 22 | self.register_buffer("center", torch.zeros(1, out_dim)) 23 | self.updated = True 24 | self.reduce_handle = None 25 | self.len_teacher_output = None 26 | self.async_batch_center = None 27 | 28 | @torch.no_grad() 29 | def softmax_center_teacher(self, teacher_output, teacher_temp): 30 | self.apply_center_update() 31 | # teacher centering and sharpening 32 | return F.softmax((teacher_output - self.center) / teacher_temp, dim=-1) 33 | 34 | @torch.no_grad() 35 | def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_iterations=3): 36 | teacher_output = teacher_output.float() 37 | world_size = dist.get_world_size() if dist.is_initialized() else 1 38 | Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper 39 | B = Q.shape[1] * world_size # number of samples to assign 40 | K = Q.shape[0] # how many prototypes 41 | 42 | # make the matrix sum to 1 43 | sum_Q = torch.sum(Q) 44 | if dist.is_initialized(): 45 | dist.all_reduce(sum_Q) 46 | Q /= sum_Q 47 | 48 | for it in range(n_iterations): 49 | # normalize each row: total weight per prototype must be 1/K 50 | sum_of_rows = torch.sum(Q, dim=1, keepdim=True) 51 | if dist.is_initialized(): 52 | dist.all_reduce(sum_of_rows) 53 | Q /= sum_of_rows 54 | Q /= K 55 | 56 | # normalize each column: total weight per sample must be 1/B 57 | Q /= torch.sum(Q, dim=0, keepdim=True) 58 | Q /= B 59 | 60 | Q *= B # the columns must sum to 1 so that Q is an assignment 61 | return Q.t() 62 | 63 | def forward(self, student_output_list, teacher_out_softmaxed_centered_list): 64 | """ 65 | Cross-entropy between softmax outputs of the teacher and student networks. 66 | """ 67 | # TODO: Use cross_entropy_distribution here 68 | total_loss = 0 69 | for s in student_output_list: 70 | lsm = F.log_softmax(s / self.student_temp, dim=-1) 71 | for t in teacher_out_softmaxed_centered_list: 72 | loss = torch.sum(t * lsm, dim=-1) 73 | total_loss -= loss.mean() 74 | return total_loss 75 | 76 | @torch.no_grad() 77 | def update_center(self, teacher_output): 78 | self.reduce_center_update(teacher_output) 79 | 80 | @torch.no_grad() 81 | def reduce_center_update(self, teacher_output): 82 | self.updated = False 83 | self.len_teacher_output = len(teacher_output) 84 | self.async_batch_center = torch.sum(teacher_output, dim=0, keepdim=True) 85 | if dist.is_initialized(): 86 | self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) 87 | 88 | @torch.no_grad() 89 | def apply_center_update(self): 90 | if self.updated is False: 91 | world_size = dist.get_world_size() if dist.is_initialized() else 1 92 | 93 | if self.reduce_handle is not None: 94 | self.reduce_handle.wait() 95 | _t = self.async_batch_center / (self.len_teacher_output * world_size) 96 | 97 | self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) 98 | 99 | self.updated = True 100 | -------------------------------------------------------------------------------- /dinov2/loss/ibot_patch_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.distributed as dist 8 | import torch.nn.functional as F 9 | from torch import nn 10 | 11 | import logging 12 | 13 | 14 | logger = logging.getLogger("dinov2") 15 | 16 | 17 | try: 18 | from xformers.ops import cross_entropy 19 | 20 | def lossfunc(t, s, temp): 21 | s = s.float() 22 | t = t.float() 23 | if s.ndim == 2: 24 | return -cross_entropy(s.unsqueeze(0), t.unsqueeze(0), temp, bw_inplace=True).squeeze(0) 25 | elif s.ndim == 3: 26 | return -cross_entropy(s, t, temp, bw_inplace=True) 27 | 28 | except ImportError: 29 | 30 | def lossfunc(t, s, temp): 31 | return torch.sum(t * F.log_softmax(s / temp, dim=-1), dim=-1) 32 | 33 | 34 | class iBOTPatchLoss(nn.Module): 35 | def __init__(self, patch_out_dim, student_temp=0.1, center_momentum=0.9): 36 | super().__init__() 37 | self.student_temp = student_temp 38 | self.center_momentum = center_momentum 39 | self.register_buffer("center", torch.zeros(1, 1, patch_out_dim)) 40 | self.updated = True 41 | self.reduce_handle = None 42 | self.len_teacher_patch_tokens = None 43 | self.async_batch_center = None 44 | 45 | @torch.no_grad() 46 | def softmax_center_teacher(self, teacher_patch_tokens, teacher_temp): 47 | self.apply_center_update() 48 | # teacher centering and sharpening 49 | # 50 | # WARNING: 51 | # as self.center is a float32, everything gets casted to float32 afterwards 52 | # 53 | # teacher_patch_tokens = teacher_patch_tokens.float() 54 | # return F.softmax((teacher_patch_tokens.sub_(self.center.to(teacher_patch_tokens.dtype))).mul_(1 / teacher_temp), dim=-1) 55 | 56 | return F.softmax((teacher_patch_tokens - self.center) / teacher_temp, dim=-1) 57 | 58 | # this is experimental, keep everything in float16 and let's see what happens: 59 | # return F.softmax((teacher_patch_tokens.sub_(self.center)) / teacher_temp, dim=-1) 60 | 61 | @torch.no_grad() 62 | def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_masked_patches_tensor, n_iterations=3): 63 | teacher_output = teacher_output.float() 64 | # world_size = dist.get_world_size() if dist.is_initialized() else 1 65 | Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper 66 | # B = Q.shape[1] * world_size # number of samples to assign 67 | B = n_masked_patches_tensor 68 | dist.all_reduce(B) 69 | K = Q.shape[0] # how many prototypes 70 | 71 | # make the matrix sum to 1 72 | sum_Q = torch.sum(Q) 73 | if dist.is_initialized(): 74 | dist.all_reduce(sum_Q) 75 | Q /= sum_Q 76 | 77 | for it in range(n_iterations): 78 | # normalize each row: total weight per prototype must be 1/K 79 | sum_of_rows = torch.sum(Q, dim=1, keepdim=True) 80 | if dist.is_initialized(): 81 | dist.all_reduce(sum_of_rows) 82 | Q /= sum_of_rows 83 | Q /= K 84 | 85 | # normalize each column: total weight per sample must be 1/B 86 | Q /= torch.sum(Q, dim=0, keepdim=True) 87 | Q /= B 88 | 89 | Q *= B # the columns must sum to 1 so that Q is an assignment 90 | return Q.t() 91 | 92 | def forward(self, student_patch_tokens, teacher_patch_tokens, student_masks_flat): 93 | """ 94 | Cross-entropy between softmax outputs of the teacher and student networks.
95 | student_patch_tokens: (B, N, D) tensor 96 | teacher_patch_tokens: (B, N, D) tensor 97 | student_masks_flat: (B, N) tensor 98 | """ 99 | t = teacher_patch_tokens 100 | s = student_patch_tokens 101 | loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) 102 | loss = torch.sum(loss * student_masks_flat.float(), dim=-1) / student_masks_flat.sum(dim=-1).clamp(min=1.0) 103 | return -loss.mean() 104 | 105 | def forward_masked( 106 | self, 107 | student_patch_tokens_masked, 108 | teacher_patch_tokens_masked, 109 | student_masks_flat, 110 | n_masked_patches=None, 111 | masks_weight=None, 112 | ): 113 | t = teacher_patch_tokens_masked 114 | s = student_patch_tokens_masked 115 | # loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) 116 | loss = lossfunc(t, s, self.student_temp) 117 | if masks_weight is None: 118 | masks_weight = ( 119 | (1 / student_masks_flat.sum(-1).clamp(min=1.0)) 120 | .unsqueeze(-1) 121 | .expand_as(student_masks_flat)[student_masks_flat] 122 | ) 123 | if n_masked_patches is not None: 124 | loss = loss[:n_masked_patches] 125 | loss = loss * masks_weight 126 | return -loss.sum() / student_masks_flat.shape[0] 127 | 128 | @torch.no_grad() 129 | def update_center(self, teacher_patch_tokens): 130 | self.reduce_center_update(teacher_patch_tokens) 131 | 132 | @torch.no_grad() 133 | def reduce_center_update(self, teacher_patch_tokens): 134 | self.updated = False 135 | self.len_teacher_patch_tokens = len(teacher_patch_tokens) 136 | self.async_batch_center = torch.sum(teacher_patch_tokens.mean(1), dim=0, keepdim=True) 137 | if dist.is_initialized(): 138 | self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) 139 | 140 | @torch.no_grad() 141 | def apply_center_update(self): 142 | if self.updated is False: 143 | world_size = dist.get_world_size() if dist.is_initialized() else 1 144 | 145 | if self.reduce_handle is not None: 146 | self.reduce_handle.wait() 147 | _t = self.async_batch_center / (self.len_teacher_patch_tokens * world_size) 148 | 149 | self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) 150 | 151 | self.updated = True 152 | -------------------------------------------------------------------------------- /dinov2/loss/koleo_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | # import torch.distributed as dist 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class KoLeoLoss(nn.Module): 19 | """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search""" 20 | 21 | def __init__(self): 22 | super().__init__() 23 | self.pdist = nn.PairwiseDistance(2, eps=1e-8) 24 | 25 | def pairwise_NNs_inner(self, x): 26 | """ 27 | Pairwise nearest neighbors for L2-normalized vectors. 28 | Uses Torch rather than Faiss to remain on GPU. 
29 | """ 30 | # parwise dot products (= inverse distance) 31 | dots = torch.mm(x, x.t()) 32 | n = x.shape[0] 33 | dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1 34 | # max inner prod -> min distance 35 | _, I = torch.max(dots, dim=1) # noqa: E741 36 | return I 37 | 38 | def forward(self, student_output, eps=1e-8): 39 | """ 40 | Args: 41 | student_output (BxD): backbone output of student 42 | """ 43 | with torch.cuda.amp.autocast(enabled=False): 44 | student_output = F.normalize(student_output, eps=eps, p=2, dim=-1) 45 | I = self.pairwise_NNs_inner(student_output) # noqa: E741 46 | distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B 47 | loss = -torch.log(distances + eps).mean() 48 | return loss 49 | -------------------------------------------------------------------------------- /dinov2/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | 8 | from . import vision_transformer as vits 9 | 10 | 11 | logger = logging.getLogger("dinov2") 12 | 13 | 14 | def build_model(args, only_teacher=False, img_size=224): 15 | args.arch = args.arch.removesuffix("_memeff") 16 | if "vit" in args.arch: 17 | vit_kwargs = dict( 18 | img_size=img_size, 19 | patch_size=args.patch_size, 20 | init_values=args.layerscale, 21 | ffn_layer=args.ffn_layer, 22 | block_chunks=args.block_chunks, 23 | qkv_bias=args.qkv_bias, 24 | proj_bias=args.proj_bias, 25 | ffn_bias=args.ffn_bias, 26 | num_register_tokens=args.num_register_tokens, 27 | interpolate_offset=args.interpolate_offset, 28 | interpolate_antialias=args.interpolate_antialias, 29 | ) 30 | teacher = vits.__dict__[args.arch](**vit_kwargs) 31 | if only_teacher: 32 | return teacher, teacher.embed_dim 33 | student = vits.__dict__[args.arch]( 34 | **vit_kwargs, 35 | drop_path_rate=args.drop_path_rate, 36 | drop_path_uniform=args.drop_path_uniform, 37 | ) 38 | embed_dim = student.embed_dim 39 | return student, teacher, embed_dim 40 | 41 | 42 | def build_model_from_cfg(cfg, only_teacher=False): 43 | return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) 44 | -------------------------------------------------------------------------------- /dinov2/run/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/run/eval/knn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
5 | 6 | import logging 7 | import os 8 | import sys 9 | 10 | from dinov2.eval.knn import get_args_parser as get_knn_args_parser 11 | from dinov2.logging import setup_logging 12 | from dinov2.run.submit import get_args_parser, submit_jobs 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class Evaluator: 19 | def __init__(self, args): 20 | self.args = args 21 | 22 | def __call__(self): 23 | from dinov2.eval.knn import main as knn_main 24 | 25 | self._setup_args() 26 | knn_main(self.args) 27 | 28 | def checkpoint(self): 29 | import submitit 30 | 31 | logger.info(f"Requeuing {self.args}") 32 | empty = type(self)(self.args) 33 | return submitit.helpers.DelayedSubmission(empty) 34 | 35 | def _setup_args(self): 36 | import submitit 37 | 38 | job_env = submitit.JobEnvironment() 39 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 40 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 41 | logger.info(f"Args: {self.args}") 42 | 43 | 44 | def main(): 45 | description = "Submitit launcher for DINOv2 k-NN evaluation" 46 | knn_args_parser = get_knn_args_parser(add_help=False) 47 | parents = [knn_args_parser] 48 | args_parser = get_args_parser(description=description, parents=parents) 49 | args = args_parser.parse_args() 50 | 51 | setup_logging() 52 | 53 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 54 | submit_jobs(Evaluator, args, name="dinov2:knn") 55 | return 0 56 | 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main()) 60 | -------------------------------------------------------------------------------- /dinov2/run/eval/linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | import os 8 | import sys 9 | 10 | from dinov2.eval.linear import get_args_parser as get_linear_args_parser 11 | from dinov2.logging import setup_logging 12 | from dinov2.run.submit import get_args_parser, submit_jobs 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class Evaluator: 19 | def __init__(self, args): 20 | self.args = args 21 | 22 | def __call__(self): 23 | from dinov2.eval.linear import main as linear_main 24 | 25 | self._setup_args() 26 | linear_main(self.args) 27 | 28 | def checkpoint(self): 29 | import submitit 30 | 31 | logger.info(f"Requeuing {self.args}") 32 | empty = type(self)(self.args) 33 | return submitit.helpers.DelayedSubmission(empty) 34 | 35 | def _setup_args(self): 36 | import submitit 37 | 38 | job_env = submitit.JobEnvironment() 39 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 40 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 41 | logger.info(f"Args: {self.args}") 42 | 43 | 44 | def main(): 45 | description = "Submitit launcher for DINOv2 linear evaluation" 46 | linear_args_parser = get_linear_args_parser(add_help=False) 47 | parents = [linear_args_parser] 48 | args_parser = get_args_parser(description=description, parents=parents) 49 | args = args_parser.parse_args() 50 | 51 | setup_logging() 52 | 53 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 
54 | submit_jobs(Evaluator, args, name="dinov2:linear") 55 | return 0 56 | 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main()) 60 | -------------------------------------------------------------------------------- /dinov2/run/eval/log_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | import os 8 | import sys 9 | 10 | from dinov2.eval.log_regression import get_args_parser as get_log_regression_args_parser 11 | from dinov2.logging import setup_logging 12 | from dinov2.run.submit import get_args_parser, submit_jobs 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class Evaluator: 19 | def __init__(self, args): 20 | self.args = args 21 | 22 | def __call__(self): 23 | from dinov2.eval.log_regression import main as log_regression_main 24 | 25 | self._setup_args() 26 | log_regression_main(self.args) 27 | 28 | def checkpoint(self): 29 | import submitit 30 | 31 | logger.info(f"Requeuing {self.args}") 32 | empty = type(self)(self.args) 33 | return submitit.helpers.DelayedSubmission(empty) 34 | 35 | def _setup_args(self): 36 | import submitit 37 | 38 | job_env = submitit.JobEnvironment() 39 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 40 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 41 | logger.info(f"Args: {self.args}") 42 | 43 | 44 | def main(): 45 | description = "Submitit launcher for DINOv2 logistic regression evaluation" 46 | log_regression_args_parser = get_log_regression_args_parser(add_help=False) 47 | parents = [log_regression_args_parser] 48 | args_parser = get_args_parser(description=description, parents=parents) 49 | args = args_parser.parse_args() 50 | 51 | setup_logging() 52 | 53 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 54 | submit_jobs(Evaluator, args, name="dinov2:logreg") 55 | return 0 56 | 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main()) 60 | -------------------------------------------------------------------------------- /dinov2/run/submit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
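# This module centralizes the SLURM plumbing shared by the launchers above. A
# sketch of how a new launcher would compose it (MyTask and
# get_my_task_args_parser are hypothetical; the pattern mirrors
# dinov2/run/eval/*.py):
#
#   parents = [get_my_task_args_parser(add_help=False)]
#   args_parser = get_args_parser(description="My launcher", parents=parents)
#   args = args_parser.parse_args()
#   submit_jobs(MyTask, args, name="dinov2:mytask")
#
# Task classes must be picklable and callable; an optional checkpoint() method
# lets submitit requeue them on timeout or preemption.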
5 | 6 | import argparse 7 | import logging 8 | import os 9 | from pathlib import Path 10 | from typing import List, Optional 11 | 12 | import submitit 13 | 14 | from dinov2.utils.cluster import ( 15 | get_slurm_executor_parameters, 16 | get_slurm_partition, 17 | get_user_checkpoint_path, 18 | ) 19 | 20 | 21 | logger = logging.getLogger("dinov2") 22 | 23 | 24 | def get_args_parser( 25 | description: Optional[str] = None, 26 | parents: Optional[List[argparse.ArgumentParser]] = None, 27 | add_help: bool = True, 28 | ) -> argparse.ArgumentParser: 29 | parents = parents or [] 30 | slurm_partition = get_slurm_partition() 31 | parser = argparse.ArgumentParser( 32 | description=description, 33 | parents=parents, 34 | add_help=add_help, 35 | ) 36 | parser.add_argument( 37 | "--ngpus", 38 | "--gpus", 39 | "--gpus-per-node", 40 | default=8, 41 | type=int, 42 | help="Number of GPUs to request on each node", 43 | ) 44 | parser.add_argument( 45 | "--nodes", 46 | "--nnodes", 47 | default=1, 48 | type=int, 49 | help="Number of nodes to request", 50 | ) 51 | parser.add_argument( 52 | "--timeout", 53 | default=2800, 54 | type=int, 55 | help="Duration of the job, in minutes", 56 | ) 57 | parser.add_argument( 58 | "--partition", 59 | default=slurm_partition, 60 | type=str, 61 | help="Partition where to submit", 62 | ) 63 | parser.add_argument( 64 | "--use-volta32", 65 | action="store_true", 66 | help="Request V100-32GB GPUs", 67 | ) 68 | parser.add_argument( 69 | "--comment", 70 | default="", 71 | type=str, 72 | help="Comment to pass to scheduler, e.g. priority message", 73 | ) 74 | parser.add_argument( 75 | "--exclude", 76 | default="", 77 | type=str, 78 | help="Nodes to exclude", 79 | ) 80 | return parser 81 | 82 | 83 | def get_shared_folder() -> Path: 84 | user_checkpoint_path = get_user_checkpoint_path() 85 | if user_checkpoint_path is None: 86 | raise RuntimeError("Path to user checkpoint cannot be determined") 87 | path = user_checkpoint_path / "experiments" 88 | path.mkdir(exist_ok=True) 89 | return path 90 | 91 | 92 | def submit_jobs(task_class, args, name: str): 93 | if not args.output_dir: 94 | args.output_dir = str(get_shared_folder() / "%j") 95 | 96 | Path(args.output_dir).mkdir(parents=True, exist_ok=True) 97 | executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30) 98 | 99 | kwargs = {} 100 | if args.use_volta32: 101 | kwargs["slurm_constraint"] = "volta32gb" 102 | if args.comment: 103 | kwargs["slurm_comment"] = args.comment 104 | if args.exclude: 105 | kwargs["slurm_exclude"] = args.exclude 106 | 107 | executor_params = get_slurm_executor_parameters( 108 | nodes=args.nodes, 109 | num_gpus_per_node=args.ngpus, 110 | timeout_min=args.timeout, # max is 60 * 72 111 | slurm_signal_delay_s=120, 112 | slurm_partition=args.partition, 113 | **kwargs, 114 | ) 115 | executor.update_parameters(name=name, **executor_params) 116 | 117 | task = task_class(args) 118 | job = executor.submit(task) 119 | 120 | logger.info(f"Submitted job_id: {job.job_id}") 121 | str_output_dir = os.path.abspath(args.output_dir).replace("%j", str(job.job_id)) 122 | logger.info(f"Logs and checkpoints will be saved at: {str_output_dir}") 123 | -------------------------------------------------------------------------------- /dinov2/run/train/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | import os 8 | import sys 9 | 10 | from dinov2.logging import setup_logging 11 | from dinov2.train import get_args_parser as get_train_args_parser 12 | from dinov2.run.submit import get_args_parser, submit_jobs 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class Trainer(object): 19 | def __init__(self, args): 20 | self.args = args 21 | 22 | def __call__(self): 23 | from dinov2.train import main as train_main 24 | 25 | self._setup_args() 26 | train_main(self.args) 27 | 28 | def checkpoint(self): 29 | import submitit 30 | 31 | logger.info(f"Requeuing {self.args}") 32 | empty = type(self)(self.args) 33 | return submitit.helpers.DelayedSubmission(empty) 34 | 35 | def _setup_args(self): 36 | import submitit 37 | 38 | job_env = submitit.JobEnvironment() 39 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 40 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 41 | logger.info(f"Args: {self.args}") 42 | 43 | 44 | def main(): 45 | description = "Submitit launcher for DINOv2 training" 46 | train_args_parser = get_train_args_parser(add_help=False) 47 | parents = [train_args_parser] 48 | args_parser = get_args_parser(description=description, parents=parents) 49 | args = args_parser.parse_args() 50 | 51 | setup_logging() 52 | 53 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 54 | submit_jobs(Trainer, args, name="dinov2:train") 55 | return 0 56 | 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main()) 60 | -------------------------------------------------------------------------------- /dinov2/train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .train import get_args_parser, main 7 | from .ssl_meta_arch import SSLMetaArch 8 | -------------------------------------------------------------------------------- /dinov2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/utils/cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
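# Illustrative behavior of get_slurm_executor_parameters below, derived from
# the defaults in this module and assuming the cluster is detected as FAIR:
#
#   get_slurm_executor_parameters(nodes=2, num_gpus_per_node=8)
#   # -> {"mem_gb": 0, "gpus_per_node": 8, "tasks_per_node": 8,
#   #     "cpus_per_task": 10, "nodes": 2, "slurm_partition": "learnlab"}
#
# On AWS the "mem_gb" key is dropped and cpus_per_task becomes 12; on RSC only
# cpus_per_task changes.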
5 | 6 | from enum import Enum 7 | import os 8 | from pathlib import Path 9 | from typing import Any, Dict, Optional 10 | 11 | 12 | class ClusterType(Enum): 13 | AWS = "aws" 14 | FAIR = "fair" 15 | RSC = "rsc" 16 | 17 | 18 | def _guess_cluster_type() -> ClusterType: 19 | uname = os.uname() 20 | if uname.sysname == "Linux": 21 | if uname.release.endswith("-aws"): 22 | # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws" 23 | return ClusterType.AWS 24 | elif uname.nodename.startswith("rsc"): 25 | # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc" 26 | return ClusterType.RSC 27 | 28 | return ClusterType.FAIR 29 | 30 | 31 | def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]: 32 | if cluster_type is None: 33 | return _guess_cluster_type() 34 | 35 | return cluster_type 36 | 37 | 38 | def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: 39 | cluster_type = get_cluster_type(cluster_type) 40 | if cluster_type is None: 41 | return None 42 | 43 | CHECKPOINT_DIRNAMES = { 44 | ClusterType.AWS: "checkpoints", 45 | ClusterType.FAIR: "checkpoint", 46 | ClusterType.RSC: "checkpoint/dino", 47 | } 48 | return Path("/") / CHECKPOINT_DIRNAMES[cluster_type] 49 | 50 | 51 | def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: 52 | checkpoint_path = get_checkpoint_path(cluster_type) 53 | if checkpoint_path is None: 54 | return None 55 | 56 | username = os.environ.get("USER") 57 | assert username is not None 58 | return checkpoint_path / username 59 | 60 | 61 | def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]: 62 | cluster_type = get_cluster_type(cluster_type) 63 | if cluster_type is None: 64 | return None 65 | 66 | SLURM_PARTITIONS = { 67 | ClusterType.AWS: "learnlab", 68 | ClusterType.FAIR: "learnlab", 69 | ClusterType.RSC: "learn", 70 | } 71 | return SLURM_PARTITIONS[cluster_type] 72 | 73 | 74 | def get_slurm_executor_parameters( 75 | nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs 76 | ) -> Dict[str, Any]: 77 | # create default parameters 78 | params = { 79 | "mem_gb": 0, # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html 80 | "gpus_per_node": num_gpus_per_node, 81 | "tasks_per_node": num_gpus_per_node, # one task per GPU 82 | "cpus_per_task": 10, 83 | "nodes": nodes, 84 | "slurm_partition": get_slurm_partition(cluster_type), 85 | } 86 | # apply cluster-specific adjustments 87 | cluster_type = get_cluster_type(cluster_type) 88 | if cluster_type == ClusterType.AWS: 89 | params["cpus_per_task"] = 12 90 | del params["mem_gb"] 91 | elif cluster_type == ClusterType.RSC: 92 | params["cpus_per_task"] = 12 93 | # set additional parameters / apply overrides 94 | params.update(kwargs) 95 | return params 96 | -------------------------------------------------------------------------------- /dinov2/utils/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
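# The sqrt_wrt_1024 scaling rule below computes
#
#   lr = base_lr * sqrt(batch_size_per_gpu * world_size / 1024)
#
# With illustrative numbers: base_lr=0.004 and 32 images per GPU on 32 GPUs
# gives a global batch of 1024, so lr stays at 0.004; keeping 32 images per
# GPU but doubling to 64 GPUs scales it by sqrt(2) to roughly 0.0057.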
5 | 6 | import math 7 | import logging 8 | import os 9 | 10 | from omegaconf import OmegaConf 11 | 12 | import dinov2.distributed as distributed 13 | from dinov2.logging import setup_logging 14 | from dinov2.utils import utils 15 | from dinov2.configs import dinov2_default_config 16 | 17 | 18 | logger = logging.getLogger("dinov2") 19 | 20 | 21 | def apply_scaling_rules_to_cfg(cfg): # to fix 22 | if cfg.optim.scaling_rule == "sqrt_wrt_1024": 23 | base_lr = cfg.optim.base_lr 24 | cfg.optim.lr = base_lr 25 | cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0) 26 | logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}") 27 | else: 28 | raise NotImplementedError 29 | return cfg 30 | 31 | 32 | def write_config(cfg, output_dir, name="config.yaml"): 33 | logger.info(OmegaConf.to_yaml(cfg)) 34 | saved_cfg_path = os.path.join(output_dir, name) 35 | with open(saved_cfg_path, "w") as f: 36 | OmegaConf.save(config=cfg, f=f) 37 | return saved_cfg_path 38 | 39 | 40 | def get_cfg_from_args(args): 41 | args.output_dir = os.path.abspath(args.output_dir) 42 | args.opts += [f"train.output_dir={args.output_dir}"] 43 | default_cfg = OmegaConf.create(dinov2_default_config) 44 | cfg = OmegaConf.load(args.config_file) 45 | cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts)) 46 | return cfg 47 | 48 | 49 | def default_setup(args): 50 | distributed.enable(overwrite=True) 51 | seed = getattr(args, "seed", 0) 52 | rank = distributed.get_global_rank() 53 | 54 | global logger 55 | setup_logging(output=args.output_dir, level=logging.INFO) 56 | logger = logging.getLogger("dinov2") 57 | 58 | utils.fix_random_seeds(seed + rank) 59 | logger.info("git:\n {}\n".format(utils.get_sha())) 60 | logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) 61 | 62 | 63 | def setup(args): 64 | """ 65 | Create configs and perform basic setups. 66 | """ 67 | cfg = get_cfg_from_args(args) 68 | os.makedirs(args.output_dir, exist_ok=True) 69 | default_setup(args) 70 | apply_scaling_rules_to_cfg(cfg) 71 | write_config(cfg, args.output_dir) 72 | return cfg 73 | -------------------------------------------------------------------------------- /dinov2/utils/dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
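# Usage sketch for as_torch_dtype below; all three spellings resolve to the
# same torch dtype:
#
#   as_torch_dtype("float32")           # -> torch.float32
#   as_torch_dtype(np.dtype("int64"))   # -> torch.int64
#   as_torch_dtype(torch.float16)       # -> torch.float16 (passed through)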
5 | 6 | 7 | from typing import Dict, Union 8 | 9 | import numpy as np 10 | import torch 11 | 12 | 13 | TypeSpec = Union[str, np.dtype, torch.dtype] 14 | 15 | 16 | _NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = { 17 | np.dtype("bool"): torch.bool, 18 | np.dtype("uint8"): torch.uint8, 19 | np.dtype("int8"): torch.int8, 20 | np.dtype("int16"): torch.int16, 21 | np.dtype("int32"): torch.int32, 22 | np.dtype("int64"): torch.int64, 23 | np.dtype("float16"): torch.float16, 24 | np.dtype("float32"): torch.float32, 25 | np.dtype("float64"): torch.float64, 26 | np.dtype("complex64"): torch.complex64, 27 | np.dtype("complex128"): torch.complex128, 28 | } 29 | 30 | 31 | def as_torch_dtype(dtype: TypeSpec) -> torch.dtype: 32 | if isinstance(dtype, torch.dtype): 33 | return dtype 34 | if isinstance(dtype, str): 35 | dtype = np.dtype(dtype) 36 | assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}" 37 | return _NUMPY_TO_TORCH_DTYPE[dtype] 38 | -------------------------------------------------------------------------------- /dinov2/utils/param_groups.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from collections import defaultdict 7 | import logging 8 | 9 | 10 | logger = logging.getLogger("dinov2") 11 | 12 | 13 | def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False): 14 | """ 15 | Calculate lr decay rate for different ViT blocks. 16 | Args: 17 | name (string): parameter name. 18 | lr_decay_rate (float): base lr decay rate. 19 | num_layers (int): number of ViT blocks. 20 | Returns: 21 | lr decay rate for the given parameter. 22 | """ 23 | layer_id = num_layers + 1 24 | if name.startswith("backbone") or force_is_backbone: 25 | if ( 26 | ".pos_embed" in name 27 | or ".patch_embed" in name 28 | or ".mask_token" in name 29 | or ".cls_token" in name 30 | or ".register_tokens" in name 31 | ): 32 | layer_id = 0 33 | elif force_is_backbone and ( 34 | "pos_embed" in name 35 | or "patch_embed" in name 36 | or "mask_token" in name 37 | or "cls_token" in name 38 | or "register_tokens" in name 39 | ): 40 | layer_id = 0 41 | elif ".blocks." in name and ".residual." not in name: 42 | layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 43 | elif chunked_blocks and "blocks." in name and "residual." not in name: 44 | layer_id = int(name[name.find("blocks.") :].split(".")[2]) + 1 45 | elif "blocks." in name and "residual."
not in name: 46 | layer_id = int(name[name.find("blocks.") :].split(".")[1]) + 1 47 | 48 | return lr_decay_rate ** (num_layers + 1 - layer_id) 49 | 50 | 51 | def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0): 52 | chunked_blocks = False 53 | if hasattr(model, "n_blocks"): 54 | logger.info("chunked fsdp") 55 | n_blocks = model.n_blocks 56 | chunked_blocks = model.chunked_blocks 57 | elif hasattr(model, "blocks"): 58 | logger.info("first code branch") 59 | n_blocks = len(model.blocks) 60 | elif hasattr(model, "backbone"): 61 | logger.info("second code branch") 62 | n_blocks = len(model.backbone.blocks) 63 | else: 64 | logger.info("else code branch") 65 | n_blocks = 0 66 | all_param_groups = [] 67 | 68 | for name, param in model.named_parameters(): 69 | name = name.replace("_fsdp_wrapped_module.", "") 70 | if not param.requires_grad: 71 | continue 72 | decay_rate = get_vit_lr_decay_rate( 73 | name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks 74 | ) 75 | d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name} 76 | 77 | if "last_layer" in name: 78 | d.update({"is_last_layer": True}) 79 | 80 | if name.endswith(".bias") or "norm" in name or "gamma" in name: 81 | d.update({"wd_multiplier": 0.0}) 82 | 83 | if "patch_embed" in name: 84 | d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult}) 85 | 86 | all_param_groups.append(d) 87 | logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""") 88 | 89 | return all_param_groups 90 | 91 | 92 | def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")): 93 | fused_params_groups = defaultdict(lambda: {"params": []}) 94 | for d in all_params_groups: 95 | identifier = "" 96 | for k in keys: 97 | identifier += k + str(d[k]) + "_" 98 | 99 | for k in keys: 100 | fused_params_groups[identifier][k] = d[k] 101 | fused_params_groups[identifier]["params"].append(d["params"]) 102 | 103 | return fused_params_groups.values() 104 | -------------------------------------------------------------------------------- /dinov2/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
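# Usage sketch for CosineScheduler below (values are illustrative, not the
# training defaults). The schedule runs freeze -> linear warmup -> cosine
# decay, and indexing past total_iters clamps to final_value:
#
#   sched = CosineScheduler(base_value=2e-3, final_value=1e-6, total_iters=1000, warmup_iters=100)
#   lr = sched[250]        # somewhere on the cosine branch
#   lr_end = sched[10_000] # == 1e-6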
5 | 6 | import logging 7 | import os 8 | import random 9 | import subprocess 10 | from urllib.parse import urlparse 11 | 12 | import numpy as np 13 | import torch 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | def load_pretrained_weights(model, pretrained_weights, checkpoint_key): 21 | if urlparse(pretrained_weights).scheme: # If it looks like a URL 22 | state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu") 23 | else: 24 | state_dict = torch.load(pretrained_weights, map_location="cpu") 25 | if checkpoint_key is not None and checkpoint_key in state_dict: 26 | logger.info(f"Take key {checkpoint_key} in provided checkpoint dict") 27 | state_dict = state_dict[checkpoint_key] 28 | # remove `module.` prefix 29 | state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} 30 | # remove `backbone.` prefix induced by multicrop wrapper 31 | state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()} 32 | msg = model.load_state_dict(state_dict, strict=False) 33 | logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg)) 34 | 35 | 36 | def fix_random_seeds(seed=31): 37 | """ 38 | Fix random seeds. 39 | """ 40 | torch.manual_seed(seed) 41 | torch.cuda.manual_seed_all(seed) 42 | np.random.seed(seed) 43 | random.seed(seed) 44 | 45 | 46 | def get_sha(): 47 | cwd = os.path.dirname(os.path.abspath(__file__)) 48 | 49 | def _run(command): 50 | return subprocess.check_output(command, cwd=cwd).decode("ascii").strip() 51 | 52 | sha = "N/A" 53 | diff = "clean" 54 | branch = "N/A" 55 | try: 56 | sha = _run(["git", "rev-parse", "HEAD"]) 57 | subprocess.check_output(["git", "diff"], cwd=cwd) 58 | diff = _run(["git", "diff-index", "HEAD"]) 59 | diff = "has uncommitted changes" if diff else "clean" 60 | branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]) 61 | except Exception: 62 | pass 63 | message = f"sha: {sha}, status: {diff}, branch: {branch}" 64 | return message 65 | 66 | 67 | class CosineScheduler(object): 68 | def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0): 69 | super().__init__() 70 | self.final_value = final_value 71 | self.total_iters = total_iters 72 | 73 | freeze_schedule = np.zeros((freeze_iters)) 74 | 75 | warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters) 76 | 77 | iters = np.arange(total_iters - warmup_iters - freeze_iters) 78 | schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters))) 79 | self.schedule = np.concatenate((freeze_schedule, warmup_schedule, schedule)) 80 | 81 | assert len(self.schedule) == self.total_iters 82 | 83 | def __getitem__(self, it): 84 | if it >= self.total_iters: 85 | return self.final_value 86 | else: 87 | return self.schedule[it] 88 | 89 | 90 | def has_batchnorms(model): 91 | bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm) 92 | for name, module in model.named_modules(): 93 | if isinstance(module, bn_types): 94 | return True 95 | return False 96 | -------------------------------------------------------------------------------- /hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
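# Hedged usage sketch for the hub entry points re-exported below (weights are
# downloaded on first use; the 384-dim output assumes the ViT-S/14 backbone
# with its default identity head):
#
#   import torch
#   backbone = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
#   feats = backbone(torch.randn(1, 3, 224, 224))  # -> (1, 384) CLS features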
5 | 6 | 7 | from dinov2.hub.backbones import dinov2_vitb14, dinov2_vitg14, dinov2_vitl14, dinov2_vits14 8 | from dinov2.hub.backbones import dinov2_vitb14_reg, dinov2_vitg14_reg, dinov2_vitl14_reg, dinov2_vits14_reg 9 | from dinov2.hub.classifiers import dinov2_vitb14_lc, dinov2_vitg14_lc, dinov2_vitl14_lc, dinov2_vits14_lc 10 | from dinov2.hub.classifiers import dinov2_vitb14_reg_lc, dinov2_vitg14_reg_lc, dinov2_vitl14_reg_lc, dinov2_vits14_reg_lc 11 | from dinov2.hub.depthers import dinov2_vitb14_ld, dinov2_vitg14_ld, dinov2_vitl14_ld, dinov2_vits14_ld 12 | from dinov2.hub.depthers import dinov2_vitb14_dd, dinov2_vitg14_dd, dinov2_vitl14_dd, dinov2_vits14_dd 13 | 14 | 15 | dependencies = ["torch"] 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | 4 | [tool.pylint.master] 5 | persistent = false 6 | score = false 7 | 8 | [tool.pylint.messages_control] 9 | disable = "all" 10 | enable = [ 11 | "miscellaneous", 12 | "similarities", 13 | ] 14 | 15 | [tool.pylint.similarities] 16 | ignore-comments = true 17 | ignore-docstrings = true 18 | ignore-imports = true 19 | min-similarity-lines = 8 20 | 21 | [tool.pylint.reports] 22 | reports = false 23 | 24 | [tool.pylint.miscellaneous] 25 | notes = [ 26 | "FIXME", 27 | "XXX", 28 | "TODO", 29 | ] 30 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black==22.6.0 2 | flake8==5.0.4 3 | pylint==2.15.0 4 | -------------------------------------------------------------------------------- /requirements-extras.txt: -------------------------------------------------------------------------------- 1 | mmcv-full==1.5.0 2 | mmsegmentation==0.27.0 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu117 2 | torch==2.0.0 3 | torchvision==0.15.0 4 | omegaconf 5 | torchmetrics==0.10.3 6 | fvcore 7 | iopath 8 | xformers==0.0.18 9 | submitit 10 | --extra-index-url https://pypi.nvidia.com 11 | cuml-cu11 12 | -------------------------------------------------------------------------------- /scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -n "$1" ]; then 4 | echo "linting \"$1\"" 5 | fi 6 | 7 | echo "running black" 8 | if [ -n "$1" ]; then 9 | black "$1" 10 | else 11 | black dinov2 12 | fi 13 | 14 | echo "running flake8" 15 | if [ -n "$1" ]; then 16 | flake8 "$1" 17 | else 18 | flake8 19 | fi 20 | 21 | echo "running pylint" 22 | if [ -n "$1" ]; then 23 | pylint "$1" 24 | else 25 | pylint dinov2 26 | fi 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203,E501,W503 4 | per-file-ignores = 5 | __init__.py:F401 6 | hubconf.py:F401 7 | exclude = 8 | venv 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from pathlib import Path 7 | import re 8 | from typing import List, Tuple 9 | 10 | from setuptools import setup, find_packages 11 | 12 | 13 | NAME = "dinov2" 14 | DESCRIPTION = "PyTorch code and models for the DINOv2 self-supervised learning method." 15 | 16 | URL = "https://github.com/facebookresearch/dinov2" 17 | AUTHOR = "FAIR" 18 | REQUIRES_PYTHON = ">=3.9.0" 19 | HERE = Path(__file__).parent 20 | 21 | 22 | try: 23 | with open(HERE / "README.md", encoding="utf-8") as f: 24 | long_description = "\n" + f.read() 25 | except FileNotFoundError: 26 | long_description = DESCRIPTION 27 | 28 | 29 | def get_requirements(path: str = HERE / "requirements.txt") -> Tuple[List[str], List[str]]: 30 | requirements = [] 31 | extra_indices = [] 32 | with open(path) as f: 33 | for line in f.readlines(): 34 | line = line.rstrip("\r\n") 35 | if line.startswith("--extra-index-url "): 36 | extra_indices.append(line[18:]) 37 | continue 38 | requirements.append(line) 39 | return requirements, extra_indices 40 | 41 | 42 | def get_package_version() -> str: 43 | with open(HERE / "dinov2/__init__.py") as f: 44 | result = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", f.read(), re.M) 45 | if result: 46 | return result.group(1) 47 | raise RuntimeError("Can't get package version") 48 | 49 | 50 | requirements, extra_indices = get_requirements() 51 | version = get_package_version() 52 | dev_requirements, _ = get_requirements(HERE / "requirements-dev.txt") 53 | extras_requirements, _ = get_requirements(HERE / "requirements-extras.txt") 54 | 55 | 56 | setup( 57 | name=NAME, 58 | version=version, 59 | description=DESCRIPTION, 60 | long_description=long_description, 61 | long_description_content_type="text/markdown", 62 | author=AUTHOR, 63 | python_requires=REQUIRES_PYTHON, 64 | url=URL, 65 | packages=find_packages(), 66 | package_data={ 67 | "": ["*.yaml"], 68 | }, 69 | install_requires=requirements, 70 | extras_require={ 71 | "dev": dev_requirements, 72 | "extras": extras_requirements, 73 | }, 74 | dependency_links=extra_indices, 75 | include_package_data=True, 76 | license="Apache", 77 | license_files=("LICENSE",), 78 | classifiers=[ 79 | # Trove classifiers: https://github.com/pypa/trove-classifiers/blob/main/src/trove_classifiers/__init__.py 80 | "Development Status :: 3 - Alpha", 81 | "Intended Audience :: Developers", 82 | "Intended Audience :: Science/Research", 83 | "License :: OSI Approved :: Apache Software License", 84 | "Programming Language :: Python :: 3.9", 85 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 86 | "Topic :: Software Development :: Libraries :: Python Modules", 87 | ], 88 | ) 89 | --------------------------------------------------------------------------------
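A minimal single-process sketch tying together the loss modules from dinov2/loss/ shown earlier (random tensors stand in for head outputs; the dinov2.loss.dino_loss module path, the 0.07 teacher temperature and the 0.1 KoLeo weight are illustrative assumptions, not values read from the training configs):

import torch

from dinov2.loss.dino_loss import DINOLoss
from dinov2.loss.koleo_loss import KoLeoLoss

out_dim, batch, embed_dim = 1024, 8, 384  # illustrative sizes

dino = DINOLoss(out_dim)
koleo = KoLeoLoss()

student_logits = torch.randn(batch, out_dim)  # student DINO head output
teacher_logits = torch.randn(batch, out_dim)  # teacher DINO head output

# Teacher targets: center then sharpen, and queue the (async) center update.
teacher_soft = dino.softmax_center_teacher(teacher_logits, teacher_temp=0.07)
dino.update_center(teacher_logits)

# Cross-entropy term plus the KoLeo spreading regularizer on backbone features.
loss = dino([student_logits], [teacher_soft]) + 0.1 * koleo(torch.randn(batch, embed_dim))
print(float(loss))

The distributed branches in both losses are guarded by dist.is_initialized(), so the sketch runs without a process group; in actual training these pieces are orchestrated by SSLMetaArch rather than called by hand.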