├── .github └── workflows │ └── lint.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MODEL_CARD.md ├── README.md ├── conda-extras.yaml ├── conda.yaml ├── dinov2 ├── __init__.py ├── configs │ ├── __init__.py │ ├── eval │ │ ├── vitb14_pretrain.yaml │ │ ├── vitb14_reg4_pretrain.yaml │ │ ├── vitg14_pretrain.yaml │ │ ├── vitg14_reg4_pretrain.yaml │ │ ├── vitl14_pretrain.yaml │ │ ├── vitl14_reg4_pretrain.yaml │ │ ├── vits14_pretrain.yaml │ │ └── vits14_reg4_pretrain.yaml │ ├── ssl_default_config.yaml │ └── train │ │ ├── vitg14.yaml │ │ ├── vitl14.yaml │ │ └── vitl16_short.yaml ├── data │ ├── __init__.py │ ├── adapters.py │ ├── augmentations.py │ ├── collate.py │ ├── datasets │ │ ├── __init__.py │ │ ├── decoders.py │ │ ├── extended.py │ │ ├── image_net.py │ │ └── image_net_22k.py │ ├── loaders.py │ ├── masking.py │ ├── samplers.py │ └── transforms.py ├── distributed │ └── __init__.py ├── eval │ ├── __init__.py │ ├── depth │ │ ├── __init__.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── backbones │ │ │ │ ├── __init__.py │ │ │ │ └── vision_transformer.py │ │ │ ├── builder.py │ │ │ ├── decode_heads │ │ │ │ ├── __init__.py │ │ │ │ ├── decode_head.py │ │ │ │ ├── dpt_head.py │ │ │ │ └── linear_head.py │ │ │ ├── depther │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── encoder_decoder.py │ │ │ └── losses │ │ │ │ ├── __init__.py │ │ │ │ ├── gradientloss.py │ │ │ │ └── sigloss.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ └── wrappers.py │ ├── knn.py │ ├── linear.py │ ├── log_regression.py │ ├── metrics.py │ ├── segmentation │ │ ├── __init__.py │ │ ├── hooks │ │ │ ├── __init__.py │ │ │ └── optimizer.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── backbones │ │ │ │ ├── __init__.py │ │ │ │ └── vision_transformer.py │ │ │ └── decode_heads │ │ │ │ ├── __init__.py │ │ │ │ └── linear_head.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── colormaps.py │ ├── segmentation_m2f │ │ ├── __init__.py │ │ ├── core │ │ │ ├── __init__.py │ │ │ ├── anchor │ │ │ │ ├── __init__.py │ │ │ │ ├── builder.py │ │ │ │ └── point_generator.py │ │ │ ├── box │ │ │ │ ├── __init__.py │ │ │ │ ├── builder.py │ │ │ │ └── samplers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base_sampler.py │ │ │ │ │ ├── mask_pseudo_sampler.py │ │ │ │ │ ├── mask_sampling_result.py │ │ │ │ │ └── sampling_result.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── dist_utils.py │ │ │ │ └── misc.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── backbones │ │ │ │ ├── __init__.py │ │ │ │ ├── adapter_modules.py │ │ │ │ ├── drop_path.py │ │ │ │ ├── vit.py │ │ │ │ └── vit_adapter.py │ │ │ ├── builder.py │ │ │ ├── decode_heads │ │ │ │ ├── __init__.py │ │ │ │ └── mask2former_head.py │ │ │ ├── losses │ │ │ │ ├── __init__.py │ │ │ │ ├── cross_entropy_loss.py │ │ │ │ ├── dice_loss.py │ │ │ │ └── match_costs.py │ │ │ ├── plugins │ │ │ │ ├── __init__.py │ │ │ │ └── msdeformattn_pixel_decoder.py │ │ │ ├── segmentors │ │ │ │ ├── __init__.py │ │ │ │ └── encoder_decoder_mask2former.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── assigner.py │ │ │ │ ├── point_sample.py │ │ │ │ ├── positional_encoding.py │ │ │ │ └── transformer.py │ │ └── ops │ │ │ └── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ ├── setup.py │ └── utils.py ├── fsdp │ └── __init__.py ├── hub │ ├── __init__.py │ ├── backbones.py │ ├── classifiers.py │ ├── depth │ │ ├── __init__.py │ │ ├── decode_heads.py │ │ ├── encoder_decoder.py │ │ └── ops.py │ ├── depthers.py │ └── utils.py ├── layers │ ├── __init__.py │ ├── attention.py │ ├── block.py │ ├── dino_head.py 
│ ├── drop_path.py │ ├── layer_scale.py │ ├── mlp.py │ ├── patch_embed.py │ └── swiglu_ffn.py ├── logging │ ├── __init__.py │ └── helpers.py ├── loss │ ├── __init__.py │ ├── dino_clstoken_loss.py │ ├── ibot_patch_loss.py │ └── koleo_loss.py ├── models │ ├── __init__.py │ └── vision_transformer.py ├── run │ ├── __init__.py │ ├── eval │ │ ├── knn.py │ │ ├── linear.py │ │ └── log_regression.py │ ├── submit.py │ └── train │ │ └── train.py ├── train │ ├── __init__.py │ ├── ssl_meta_arch.py │ └── train.py └── utils │ ├── __init__.py │ ├── cluster.py │ ├── config.py │ ├── dtype.py │ ├── param_groups.py │ └── utils.py ├── hubconf.py ├── notebooks ├── depth_estimation.ipynb └── semantic_segmentation.ipynb ├── pyproject.toml ├── requirements-dev.txt ├── requirements-extras.txt ├── requirements.txt ├── scripts └── lint.sh ├── setup.cfg └── setup.py /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run-linters: 13 | name: Run linters 14 | runs-on: ubuntu-20.04 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v3 19 | - name: Set up Python 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: 3.9 23 | cache: 'pip' 24 | cache-dependency-path: '**/requirements*.txt' 25 | - name: Install Python (development) dependencies 26 | run: | 27 | pip install -r requirements-dev.txt 28 | - name: Run flake8 29 | run: | 30 | flake8 31 | - name: Run black 32 | if: always() 33 | run: | 34 | black --check dinov2 35 | - name: Run pylint 36 | if: always() 37 | run: | 38 | pylint --exit-zero dinov2 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | *.egg-info/ 4 | **/__pycache__/ 5 | 6 | **/.ipynb_checkpoints 7 | **/.ipynb_checkpoints/** 8 | 9 | *.swp 10 | 11 | .vscode/ 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DINOv2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to DINOv2, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 
32 | -------------------------------------------------------------------------------- /conda-extras.yaml: -------------------------------------------------------------------------------- 1 | name: dinov2-extras 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | - xformers 7 | - conda-forge 8 | dependencies: 9 | - python=3.9 10 | - pytorch::pytorch=2.0.0 11 | - pytorch::pytorch-cuda=11.7.0 12 | - pytorch::torchvision=0.15.0 13 | - omegaconf 14 | - torchmetrics=0.10.3 15 | - fvcore 16 | - iopath 17 | - xformers::xformers=0.0.18 18 | - pip 19 | - pip: 20 | - git+https://github.com/facebookincubator/submitit 21 | - --extra-index-url https://pypi.nvidia.com 22 | - cuml-cu11 23 | - mmcv-full==1.5.0 24 | - mmsegmentation==0.27.0 25 | -------------------------------------------------------------------------------- /conda.yaml: -------------------------------------------------------------------------------- 1 | name: dinov2 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | - xformers 7 | - conda-forge 8 | dependencies: 9 | - python=3.9 10 | - pytorch::pytorch=2.0.0 11 | - pytorch::pytorch-cuda=11.7.0 12 | - pytorch::torchvision=0.15.0 13 | - omegaconf 14 | - torchmetrics=0.10.3 15 | - fvcore 16 | - iopath 17 | - xformers::xformers=0.0.18 18 | - pip 19 | - pip: 20 | - git+https://github.com/facebookincubator/submitit 21 | - --extra-index-url https://pypi.nvidia.com 22 | - cuml-cu11 23 | -------------------------------------------------------------------------------- /dinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | __version__ = "0.0.1" 7 | -------------------------------------------------------------------------------- /dinov2/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
5 | 6 | import pathlib 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | def load_config(config_name: str): 12 | config_filename = config_name + ".yaml" 13 | return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename) 14 | 15 | 16 | dinov2_default_config = load_config("ssl_default_config") 17 | 18 | 19 | def load_and_merge_config(config_name: str): 20 | default_config = OmegaConf.create(dinov2_default_config) 21 | loaded_config = load_config(config_name) 22 | return OmegaConf.merge(default_config, loaded_config) 23 | -------------------------------------------------------------------------------- /dinov2/configs/eval/vitb14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_base 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitb14_reg4_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_base 3 | patch_size: 14 4 | num_register_tokens: 4 5 | interpolate_antialias: true 6 | interpolate_offset: 0.0 7 | crops: 8 | global_crops_size: 518 # this is to set up the position embeddings properly 9 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitg14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_giant2 3 | patch_size: 14 4 | ffn_layer: swiglufused 5 | crops: 6 | global_crops_size: 518 # this is to set up the position embeddings properly 7 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitg14_reg4_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_giant2 3 | patch_size: 14 4 | ffn_layer: swiglufused 5 | num_register_tokens: 4 6 | interpolate_antialias: true 7 | interpolate_offset: 0.0 8 | crops: 9 | global_crops_size: 518 # this is to set up the position embeddings properly 10 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitl14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_large 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vitl14_reg4_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_large 3 | patch_size: 14 4 | num_register_tokens: 4 5 | interpolate_antialias: true 6 | interpolate_offset: 0.0 7 | crops: 8 | global_crops_size: 518 # this is to set up the position embeddings properly 9 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/eval/vits14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_small 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 
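These eval configs are deliberately thin overlays: load_and_merge_config (defined in dinov2/configs/__init__.py above) merges each of them on top of ssl_default_config, so a file like vits14_pretrain.yaml only needs to override the architecture, the patch size, and the crop sizes. The global crop size of 518 is chosen because 518 = 37 x 14: it is divisible by the patch size, giving a 37 x 37 token grid that the position embeddings are set up for. A minimal usage sketch (assuming the dinov2 package is importable; the printed values follow from the YAML overlay above and the defaults in ssl_default_config further down):

from dinov2.configs import load_and_merge_config

cfg = load_and_merge_config("eval/vits14_pretrain")
print(cfg.student.arch)             # vit_small, from the eval overlay
print(cfg.student.patch_size)       # 14, overriding the default of 16
print(cfg.crops.global_crops_size)  # 518, i.e. 518 / 14 = 37 patches per side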
-------------------------------------------------------------------------------- /dinov2/configs/eval/vits14_reg4_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_small 3 | patch_size: 14 4 | num_register_tokens: 4 5 | interpolate_antialias: true 6 | interpolate_offset: 0.0 7 | crops: 8 | global_crops_size: 518 # this is to set up the position embeddings properly 9 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/ssl_default_config.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHTS: '' 3 | compute_precision: 4 | grad_scaler: true 5 | teacher: 6 | backbone: 7 | sharding_strategy: SHARD_GRAD_OP 8 | mixed_precision: 9 | param_dtype: fp16 10 | reduce_dtype: fp16 11 | buffer_dtype: fp32 12 | dino_head: 13 | sharding_strategy: SHARD_GRAD_OP 14 | mixed_precision: 15 | param_dtype: fp16 16 | reduce_dtype: fp16 17 | buffer_dtype: fp32 18 | ibot_head: 19 | sharding_strategy: SHARD_GRAD_OP 20 | mixed_precision: 21 | param_dtype: fp16 22 | reduce_dtype: fp16 23 | buffer_dtype: fp32 24 | student: 25 | backbone: 26 | sharding_strategy: SHARD_GRAD_OP 27 | mixed_precision: 28 | param_dtype: fp16 29 | reduce_dtype: fp16 30 | buffer_dtype: fp32 31 | dino_head: 32 | sharding_strategy: SHARD_GRAD_OP 33 | mixed_precision: 34 | param_dtype: fp16 35 | reduce_dtype: fp32 36 | buffer_dtype: fp32 37 | ibot_head: 38 | sharding_strategy: SHARD_GRAD_OP 39 | mixed_precision: 40 | param_dtype: fp16 41 | reduce_dtype: fp32 42 | buffer_dtype: fp32 43 | dino: 44 | loss_weight: 1.0 45 | head_n_prototypes: 65536 46 | head_bottleneck_dim: 256 47 | head_nlayers: 3 48 | head_hidden_dim: 2048 49 | koleo_loss_weight: 0.1 50 | ibot: 51 | loss_weight: 1.0 52 | mask_sample_probability: 0.5 53 | mask_ratio_min_max: 54 | - 0.1 55 | - 0.5 56 | separate_head: false 57 | head_n_prototypes: 65536 58 | head_bottleneck_dim: 256 59 | head_nlayers: 3 60 | head_hidden_dim: 2048 61 | train: 62 | batch_size_per_gpu: 64 63 | dataset_path: ImageNet:split=TRAIN 64 | output_dir: . 65 | saveckp_freq: 20 66 | seed: 0 67 | num_workers: 10 68 | OFFICIAL_EPOCH_LENGTH: 1250 69 | cache_dataset: true 70 | centering: "centering" # or "sinkhorn_knopp" 71 | student: 72 | arch: vit_large 73 | patch_size: 16 74 | drop_path_rate: 0.3 75 | layerscale: 1.0e-05 76 | drop_path_uniform: true 77 | pretrained_weights: '' 78 | ffn_layer: "mlp" 79 | block_chunks: 0 80 | qkv_bias: true 81 | proj_bias: true 82 | ffn_bias: true 83 | num_register_tokens: 0 84 | interpolate_antialias: false 85 | interpolate_offset: 0.1 86 | teacher: 87 | momentum_teacher: 0.992 88 | final_momentum_teacher: 1 89 | warmup_teacher_temp: 0.04 90 | teacher_temp: 0.07 91 | warmup_teacher_temp_epochs: 30 92 | optim: 93 | epochs: 100 94 | weight_decay: 0.04 95 | weight_decay_end: 0.4 96 | base_lr: 0.004 # learning rate for a batch size of 1024 97 | lr: 0. 
# will be set after applying scaling rule 98 | warmup_epochs: 10 99 | min_lr: 1.0e-06 100 | clip_grad: 3.0 101 | freeze_last_layer_epochs: 1 102 | scaling_rule: sqrt_wrt_1024 103 | patch_embed_lr_mult: 0.2 104 | layerwise_decay: 0.9 105 | adamw_beta1: 0.9 106 | adamw_beta2: 0.999 107 | crops: 108 | global_crops_scale: 109 | - 0.32 110 | - 1.0 111 | local_crops_number: 8 112 | local_crops_scale: 113 | - 0.05 114 | - 0.32 115 | global_crops_size: 224 116 | local_crops_size: 96 117 | evaluation: 118 | eval_period_iterations: 12500 119 | -------------------------------------------------------------------------------- /dinov2/configs/train/vitg14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 12 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_giant2 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/train/vitl14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 32 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_large 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /dinov2/configs/train/vitl16_short.yaml: -------------------------------------------------------------------------------- 1 | # this corresponds to the default config 2 | train: 3 | dataset_path: ImageNet:split=TRAIN 4 | batch_size_per_gpu: 64 5 | student: 6 | block_chunks: 4 7 | -------------------------------------------------------------------------------- /dinov2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .adapters import DatasetWithEnumeratedTargets 7 | from .loaders import make_data_loader, make_dataset, SamplerType 8 | from .collate import collate_data_and_cast 9 | from .masking import MaskingGenerator 10 | from .augmentations import DataAugmentationDINO 11 | -------------------------------------------------------------------------------- /dinov2/data/adapters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from typing import Any, Tuple 7 | 8 | from torch.utils.data import Dataset 9 | 10 | 11 | class DatasetWithEnumeratedTargets(Dataset): 12 | def __init__(self, dataset): 13 | self._dataset = dataset 14 | 15 | def get_image_data(self, index: int) -> bytes: 16 | return self._dataset.get_image_data(index) 17 | 18 | def get_target(self, index: int) -> Tuple[Any, int]: 19 | target = self._dataset.get_target(index) 20 | return (index, target) 21 | 22 | def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]: 23 | image, target = self._dataset[index] 24 | target = index if target is None else target 25 | return image, (index, target) 26 | 27 | def __len__(self) -> int: 28 | return len(self._dataset) 29 | -------------------------------------------------------------------------------- /dinov2/data/augmentations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | 8 | from torchvision import transforms 9 | 10 | from .transforms import ( 11 | GaussianBlur, 12 | make_normalize_transform, 13 | ) 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class DataAugmentationDINO(object): 20 | def __init__( 21 | self, 22 | global_crops_scale, 23 | local_crops_scale, 24 | local_crops_number, 25 | global_crops_size=224, 26 | local_crops_size=96, 27 | ): 28 | self.global_crops_scale = global_crops_scale 29 | self.local_crops_scale = local_crops_scale 30 | self.local_crops_number = local_crops_number 31 | self.global_crops_size = global_crops_size 32 | self.local_crops_size = local_crops_size 33 | 34 | logger.info("###################################") 35 | logger.info("Using data augmentation parameters:") 36 | logger.info(f"global_crops_scale: {global_crops_scale}") 37 | logger.info(f"local_crops_scale: {local_crops_scale}") 38 | logger.info(f"local_crops_number: {local_crops_number}") 39 | logger.info(f"global_crops_size: {global_crops_size}") 40 | logger.info(f"local_crops_size: {local_crops_size}") 41 | logger.info("###################################") 42 | 43 | # random resized crop and flip 44 | self.geometric_augmentation_global = transforms.Compose( 45 | [ 46 | transforms.RandomResizedCrop( 47 | global_crops_size, scale=global_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC 48 | ), 49 | transforms.RandomHorizontalFlip(p=0.5), 50 | ] 51 | ) 52 | 53 | self.geometric_augmentation_local = transforms.Compose( 54 | [ 55 | transforms.RandomResizedCrop( 56 | local_crops_size, scale=local_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC 57 | ), 58 | transforms.RandomHorizontalFlip(p=0.5), 59 | ] 60 | ) 61 | 62 | # color distortions / blurring 63 | color_jittering = transforms.Compose( 64 | [ 65 | transforms.RandomApply( 66 | [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)], 67 | p=0.8, 68 | ), 69 | transforms.RandomGrayscale(p=0.2), 70 | ] 71 | ) 72 | 73 | global_transfo1_extra = GaussianBlur(p=1.0) 74 | 75 | global_transfo2_extra = transforms.Compose( 76 | [ 77 | GaussianBlur(p=0.1), 78 | transforms.RandomSolarize(threshold=128, p=0.2), 79 | ] 80 | ) 81 | 82 | local_transfo_extra = GaussianBlur(p=0.5) 83
| 84 | # normalization 85 | self.normalize = transforms.Compose( 86 | [ 87 | transforms.ToTensor(), 88 | make_normalize_transform(), 89 | ] 90 | ) 91 | 92 | self.global_transfo1 = transforms.Compose([color_jittering, global_transfo1_extra, self.normalize]) 93 | self.global_transfo2 = transforms.Compose([color_jittering, global_transfo2_extra, self.normalize]) 94 | self.local_transfo = transforms.Compose([color_jittering, local_transfo_extra, self.normalize]) 95 | 96 | def __call__(self, image): 97 | output = {} 98 | 99 | # global crops: 100 | im1_base = self.geometric_augmentation_global(image) 101 | global_crop_1 = self.global_transfo1(im1_base) 102 | 103 | im2_base = self.geometric_augmentation_global(image) 104 | global_crop_2 = self.global_transfo2(im2_base) 105 | 106 | output["global_crops"] = [global_crop_1, global_crop_2] 107 | 108 | # global crops for teacher: 109 | output["global_crops_teacher"] = [global_crop_1, global_crop_2] 110 | 111 | # local crops: 112 | local_crops = [ 113 | self.local_transfo(self.geometric_augmentation_local(image)) for _ in range(self.local_crops_number) 114 | ] 115 | output["local_crops"] = local_crops 116 | output["offsets"] = () 117 | 118 | return output 119 | -------------------------------------------------------------------------------- /dinov2/data/collate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import random 8 | 9 | 10 | def collate_data_and_cast(samples_list, mask_ratio_tuple, mask_probability, dtype, n_tokens=None, mask_generator=None): 11 | # dtype = torch.half # TODO: Remove 12 | 13 | n_global_crops = len(samples_list[0][0]["global_crops"]) 14 | n_local_crops = len(samples_list[0][0]["local_crops"]) 15 | 16 | collated_global_crops = torch.stack([s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list]) 17 | 18 | collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list]) 19 | 20 | B = len(collated_global_crops) 21 | N = n_tokens 22 | n_samples_masked = int(B * mask_probability) 23 | probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1) 24 | upperbound = 0 25 | masks_list = [] 26 | for i in range(0, n_samples_masked): 27 | prob_min = probs[i] 28 | prob_max = probs[i + 1] 29 | masks_list.append(torch.BoolTensor(mask_generator(int(N * random.uniform(prob_min, prob_max))))) 30 | upperbound += int(N * prob_max) 31 | for i in range(n_samples_masked, B): 32 | masks_list.append(torch.BoolTensor(mask_generator(0))) 33 | 34 | random.shuffle(masks_list) 35 | 36 | collated_masks = torch.stack(masks_list).flatten(1) 37 | mask_indices_list = collated_masks.flatten().nonzero().flatten() 38 | 39 | masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks] 40 | 41 | return { 42 | "collated_global_crops": collated_global_crops.to(dtype), 43 | "collated_local_crops": collated_local_crops.to(dtype), 44 | "collated_masks": collated_masks, 45 | "mask_indices_list": mask_indices_list, 46 | "masks_weight": masks_weight, 47 | "upperbound": upperbound, 48 | "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long), 49 | } 50 | -------------------------------------------------------------------------------- 
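Taken together, the pieces exported from dinov2/data/__init__.py form the iBOT-style data pipeline: DataAugmentationDINO (above) maps one image to a dict of global and local crops, MaskingGenerator (below, in dinov2/data/masking.py) samples block-shaped patch masks, and collate_data_and_cast stacks the crops, draws a per-sample mask ratio from mask_ratio_tuple for roughly mask_probability of the global crops, and casts to the training dtype. A minimal end-to-end sketch, not the repo's train loop; the 224-pixel global crop, 16-pixel patch, and mask parameters mirror ssl_default_config, while the blank PIL image and batch of 4 are purely illustrative:

from functools import partial

import torch
from PIL import Image

from dinov2.data import DataAugmentationDINO, MaskingGenerator, collate_data_and_cast

crop_size, patch_size = 224, 16
n_tokens = (crop_size // patch_size) ** 2  # 196 patch tokens per global crop

augment = DataAugmentationDINO(
    global_crops_scale=(0.32, 1.0),
    local_crops_scale=(0.05, 0.32),
    local_crops_number=8,
    global_crops_size=crop_size,
)
mask_generator = MaskingGenerator(
    input_size=(crop_size // patch_size, crop_size // patch_size),
    max_num_patches=0.5 * n_tokens,  # cap any single mask at half the tokens
)
collate_fn = partial(
    collate_data_and_cast,
    mask_ratio_tuple=(0.1, 0.5),
    mask_probability=0.5,
    dtype=torch.half,
    n_tokens=n_tokens,
    mask_generator=mask_generator,
)

image = Image.new("RGB", (256, 256))
# collate_data_and_cast only reads s[0], so samples are (crops_dict, target) pairs
batch = collate_fn([(augment(image), 0) for _ in range(4)])
print(batch["collated_global_crops"].shape)  # torch.Size([8, 3, 224, 224])
print(batch["collated_masks"].shape)         # torch.Size([8, 196])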
/dinov2/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .image_net import ImageNet 7 | from .image_net_22k import ImageNet22k 8 | -------------------------------------------------------------------------------- /dinov2/data/datasets/decoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from io import BytesIO 7 | from typing import Any 8 | 9 | from PIL import Image 10 | 11 | 12 | class Decoder: 13 | def decode(self) -> Any: 14 | raise NotImplementedError 15 | 16 | 17 | class ImageDataDecoder(Decoder): 18 | def __init__(self, image_data: bytes) -> None: 19 | self._image_data = image_data 20 | 21 | def decode(self) -> Image.Image: 22 | f = BytesIO(self._image_data) 23 | return Image.open(f).convert(mode="RGB") 24 | 25 | 26 | class TargetDecoder(Decoder): 27 | def __init__(self, target: Any): 28 | self._target = target 29 | 30 | def decode(self) -> Any: 31 | return self._target 32 | -------------------------------------------------------------------------------- /dinov2/data/datasets/extended.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from typing import Any, Tuple 7 | 8 | from torchvision.datasets import VisionDataset 9 | 10 | from .decoders import TargetDecoder, ImageDataDecoder 11 | 12 | 13 | class ExtendedVisionDataset(VisionDataset): 14 | def __init__(self, *args, **kwargs) -> None: 15 | super().__init__(*args, **kwargs)  # type: ignore 16 | 17 | def get_image_data(self, index: int) -> bytes: 18 | raise NotImplementedError 19 | 20 | def get_target(self, index: int) -> Any: 21 | raise NotImplementedError 22 | 23 | def __getitem__(self, index: int) -> Tuple[Any, Any]: 24 | try: 25 | image_data = self.get_image_data(index) 26 | image = ImageDataDecoder(image_data).decode() 27 | except Exception as e: 28 | raise RuntimeError(f"cannot read image for sample {index}") from e 29 | target = self.get_target(index) 30 | target = TargetDecoder(target).decode() 31 | 32 | if self.transforms is not None: 33 | image, target = self.transforms(image, target) 34 | 35 | return image, target 36 | 37 | def __len__(self) -> int: 38 | raise NotImplementedError 39 | -------------------------------------------------------------------------------- /dinov2/data/masking.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
5 | 6 | import random 7 | import math 8 | import numpy as np 9 | 10 | 11 | class MaskingGenerator: 12 | def __init__( 13 | self, 14 | input_size, 15 | num_masking_patches=None, 16 | min_num_patches=4, 17 | max_num_patches=None, 18 | min_aspect=0.3, 19 | max_aspect=None, 20 | ): 21 | if not isinstance(input_size, tuple): 22 | input_size = (input_size,) * 2 23 | self.height, self.width = input_size 24 | 25 | self.num_patches = self.height * self.width 26 | self.num_masking_patches = num_masking_patches 27 | 28 | self.min_num_patches = min_num_patches 29 | self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches 30 | 31 | max_aspect = max_aspect or 1 / min_aspect 32 | self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) 33 | 34 | def __repr__(self): 35 | repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( 36 | self.height, 37 | self.width, 38 | self.min_num_patches, 39 | self.max_num_patches, 40 | self.num_masking_patches, 41 | self.log_aspect_ratio[0], 42 | self.log_aspect_ratio[1], 43 | ) 44 | return repr_str 45 | 46 | def get_shape(self): 47 | return self.height, self.width 48 | 49 | def _mask(self, mask, max_mask_patches): 50 | delta = 0 51 | for _ in range(10): 52 | target_area = random.uniform(self.min_num_patches, max_mask_patches) 53 | aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) 54 | h = int(round(math.sqrt(target_area * aspect_ratio))) 55 | w = int(round(math.sqrt(target_area / aspect_ratio))) 56 | if w < self.width and h < self.height: 57 | top = random.randint(0, self.height - h) 58 | left = random.randint(0, self.width - w) 59 | 60 | num_masked = mask[top : top + h, left : left + w].sum() 61 | # Overlap 62 | if 0 < h * w - num_masked <= max_mask_patches: 63 | for i in range(top, top + h): 64 | for j in range(left, left + w): 65 | if mask[i, j] == 0: 66 | mask[i, j] = 1 67 | delta += 1 68 | 69 | if delta > 0: 70 | break 71 | return delta 72 | 73 | def __call__(self, num_masking_patches=0): 74 | mask = np.zeros(shape=self.get_shape(), dtype=bool) 75 | mask_count = 0 76 | while mask_count < num_masking_patches: 77 | max_mask_patches = num_masking_patches - mask_count 78 | max_mask_patches = min(max_mask_patches, self.max_num_patches) 79 | 80 | delta = self._mask(mask, max_mask_patches) 81 | if delta == 0: 82 | break 83 | else: 84 | mask_count += delta 85 | 86 | return mask 87 | -------------------------------------------------------------------------------- /dinov2/data/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from typing import Sequence 7 | 8 | import torch 9 | from torchvision import transforms 10 | 11 | 12 | class GaussianBlur(transforms.RandomApply): 13 | """ 14 | Apply Gaussian Blur to the PIL image. 15 | """ 16 | 17 | def __init__(self, *, p: float = 0.5, radius_min: float = 0.1, radius_max: float = 2.0): 18 | # NOTE: torchvision's RandomApply applies the transforms with probability p 19 | # (and returns the original image with probability 1 - p), so p is passed through as-is 20 | transform = transforms.GaussianBlur(kernel_size=9, sigma=(radius_min, radius_max)) 21 | super().__init__(transforms=[transform], p=p) 22 | 23 | 24 | class MaybeToTensor(transforms.ToTensor): 25 | """ 26 | Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor, or keep as is if already a tensor.
27 | """ 28 | 29 | def __call__(self, pic): 30 | """ 31 | Args: 32 | pic (PIL Image, numpy.ndarray or torch.tensor): Image to be converted to tensor. 33 | Returns: 34 | Tensor: Converted image. 35 | """ 36 | if isinstance(pic, torch.Tensor): 37 | return pic 38 | return super().__call__(pic) 39 | 40 | 41 | # Use timm's names 42 | IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) 43 | IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) 44 | 45 | 46 | def make_normalize_transform( 47 | mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, 48 | std: Sequence[float] = IMAGENET_DEFAULT_STD, 49 | ) -> transforms.Normalize: 50 | return transforms.Normalize(mean=mean, std=std) 51 | 52 | 53 | # This roughly matches torchvision's preset for classification training: 54 | # https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L6-L44 55 | def make_classification_train_transform( 56 | *, 57 | crop_size: int = 224, 58 | interpolation=transforms.InterpolationMode.BICUBIC, 59 | hflip_prob: float = 0.5, 60 | mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, 61 | std: Sequence[float] = IMAGENET_DEFAULT_STD, 62 | ): 63 | transforms_list = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] 64 | if hflip_prob > 0.0: 65 | transforms_list.append(transforms.RandomHorizontalFlip(hflip_prob)) 66 | transforms_list.extend( 67 | [ 68 | MaybeToTensor(), 69 | make_normalize_transform(mean=mean, std=std), 70 | ] 71 | ) 72 | return transforms.Compose(transforms_list) 73 | 74 | 75 | # This matches (roughly) torchvision's preset for classification evaluation: 76 | # https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L47-L69 77 | def make_classification_eval_transform( 78 | *, 79 | resize_size: int = 256, 80 | interpolation=transforms.InterpolationMode.BICUBIC, 81 | crop_size: int = 224, 82 | mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, 83 | std: Sequence[float] = IMAGENET_DEFAULT_STD, 84 | ) -> transforms.Compose: 85 | transforms_list = [ 86 | transforms.Resize(resize_size, interpolation=interpolation), 87 | transforms.CenterCrop(crop_size), 88 | MaybeToTensor(), 89 | make_normalize_transform(mean=mean, std=std), 90 | ] 91 | return transforms.Compose(transforms_list) 92 | -------------------------------------------------------------------------------- /dinov2/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/eval/depth/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
5 | 6 | from .backbones import *  # noqa: F403 7 | from .builder import BACKBONES, DEPTHER, HEADS, LOSSES, build_backbone, build_depther, build_head, build_loss 8 | from .decode_heads import *  # noqa: F403 9 | from .depther import *  # noqa: F403 10 | from .losses import *  # noqa: F403 11 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .vision_transformer import DinoVisionTransformer 7 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/backbones/vision_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from mmcv.runner import BaseModule 7 | 8 | from ..builder import BACKBONES 9 | 10 | 11 | @BACKBONES.register_module() 12 | class DinoVisionTransformer(BaseModule): 13 | """Vision Transformer.""" 14 | 15 | def __init__(self, *args, **kwargs): 16 | super().__init__() 17 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import warnings 7 | 8 | from mmcv.cnn import MODELS as MMCV_MODELS 9 | from mmcv.cnn.bricks.registry import ATTENTION as MMCV_ATTENTION 10 | from mmcv.utils import Registry 11 | 12 | MODELS = Registry("models", parent=MMCV_MODELS) 13 | ATTENTION = Registry("attention", parent=MMCV_ATTENTION) 14 | 15 | 16 | BACKBONES = MODELS 17 | NECKS = MODELS 18 | HEADS = MODELS 19 | LOSSES = MODELS 20 | DEPTHER = MODELS 21 | 22 | 23 | def build_backbone(cfg): 24 | """Build backbone.""" 25 | return BACKBONES.build(cfg) 26 | 27 | 28 | def build_neck(cfg): 29 | """Build neck.""" 30 | return NECKS.build(cfg) 31 | 32 | 33 | def build_head(cfg): 34 | """Build head.""" 35 | return HEADS.build(cfg) 36 | 37 | 38 | def build_loss(cfg): 39 | """Build loss.""" 40 | return LOSSES.build(cfg) 41 | 42 | 43 | def build_depther(cfg, train_cfg=None, test_cfg=None): 44 | """Build depther.""" 45 | if train_cfg is not None or test_cfg is not None: 46 | warnings.warn("train_cfg and test_cfg are deprecated, please specify them in model", UserWarning) 47 | assert cfg.get("train_cfg") is None or train_cfg is None, "train_cfg specified in both outer field and model field" 48 | assert cfg.get("test_cfg") is None or test_cfg is None, "test_cfg specified in both outer field and model field" 49 | return DEPTHER.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) 50 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .dpt_head import DPTHead 7 | from .linear_head import BNHead 8 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/decode_heads/linear_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from ...ops import resize 10 | from ..builder import HEADS 11 | from .decode_head import DepthBaseDecodeHead 12 | 13 | 14 | @HEADS.register_module() 15 | class BNHead(DepthBaseDecodeHead): 16 | """Just a batchnorm.""" 17 | 18 | def __init__(self, input_transform="resize_concat", in_index=(0, 1, 2, 3), upsample=1, **kwargs): 19 | super().__init__(**kwargs) 20 | self.input_transform = input_transform 21 | self.in_index = in_index 22 | self.upsample = upsample 23 | # self.bn = nn.SyncBatchNorm(self.in_channels) 24 | if self.classify: 25 | self.conv_depth = nn.Conv2d(self.channels, self.n_bins, kernel_size=1, padding=0, stride=1) 26 | else: 27 | self.conv_depth = nn.Conv2d(self.channels, 1, kernel_size=1, padding=0, stride=1) 28 | 29 | def _transform_inputs(self, inputs): 30 | """Transform inputs for decoder. 31 | Args: 32 | inputs (list[Tensor]): List of multi-level img features. 33 | Returns: 34 | Tensor: The transformed inputs 35 | """ 36 | 37 | if "concat" in self.input_transform: 38 | inputs = [inputs[i] for i in self.in_index] 39 | if "resize" in self.input_transform: 40 | inputs = [ 41 | resize( 42 | input=x, 43 | size=[s * self.upsample for s in inputs[0].shape[2:]], 44 | mode="bilinear", 45 | align_corners=self.align_corners, 46 | ) 47 | for x in inputs 48 | ] 49 | inputs = torch.cat(inputs, dim=1) 50 | elif self.input_transform == "multiple_select": 51 | inputs = [inputs[i] for i in self.in_index] 52 | else: 53 | inputs = inputs[self.in_index] 54 | 55 | return inputs 56 | 57 | def _forward_feature(self, inputs, img_metas=None, **kwargs): 58 | """Forward function for feature maps before classifying each pixel with 59 | ``self.cls_seg`` fc. 60 | Args: 61 | inputs (list[Tensor]): List of multi-level img features. 62 | Returns: 63 | feats (Tensor): A tensor of shape (batch_size, self.channels, 64 | H, W) which is feature map for last layer of decoder head. 
65 | """ 66 | # accept lists (for cls token) 67 | inputs = list(inputs) 68 | for i, x in enumerate(inputs): 69 | if len(x) == 2: 70 | x, cls_token = x[0], x[1] 71 | if len(x.shape) == 2: 72 | x = x[:, :, None, None] 73 | cls_token = cls_token[:, :, None, None].expand_as(x) 74 | inputs[i] = torch.cat((x, cls_token), 1) 75 | else: 76 | x = x[0] 77 | if len(x.shape) == 2: 78 | x = x[:, :, None, None] 79 | inputs[i] = x 80 | x = self._transform_inputs(inputs) 81 | # feats = self.bn(x) 82 | return x 83 | 84 | def forward(self, inputs, img_metas=None, **kwargs): 85 | """Forward function.""" 86 | output = self._forward_feature(inputs, img_metas=img_metas, **kwargs) 87 | output = self.depth_pred(output) 88 | 89 | return output 90 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/depther/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .base import BaseDepther 7 | from .encoder_decoder import DepthEncoderDecoder 8 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .gradientloss import GradientLoss 7 | from .sigloss import SigLoss 8 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/losses/gradientloss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from ...models.builder import LOSSES 10 | 11 | 12 | @LOSSES.register_module() 13 | class GradientLoss(nn.Module): 14 | """GradientLoss. 15 | 16 | Adapted from https://www.cs.cornell.edu/projects/megadepth/ 17 | 18 | Args: 19 | valid_mask (bool): Whether filter invalid gt (gt > 0). Default: True. 20 | loss_weight (float): Weight of the loss. Default: 1.0. 21 | max_depth (int): When filtering invalid gt, set a max threshold. Default: None. 
22 | """ 23 | 24 | def __init__(self, valid_mask=True, loss_weight=1.0, max_depth=None, loss_name="loss_grad"): 25 | super(GradientLoss, self).__init__() 26 | self.valid_mask = valid_mask 27 | self.loss_weight = loss_weight 28 | self.max_depth = max_depth 29 | self.loss_name = loss_name 30 | 31 | self.eps = 0.001 # avoid grad explode 32 | 33 | def gradientloss(self, input, target): 34 | input_downscaled = [input] + [input[:: 2 * i, :: 2 * i] for i in range(1, 4)] 35 | target_downscaled = [target] + [target[:: 2 * i, :: 2 * i] for i in range(1, 4)] 36 | 37 | gradient_loss = 0 38 | for input, target in zip(input_downscaled, target_downscaled): 39 | if self.valid_mask: 40 | mask = target > 0 41 | if self.max_depth is not None: 42 | mask = torch.logical_and(target > 0, target <= self.max_depth) 43 | N = torch.sum(mask) 44 | else: 45 | mask = torch.ones_like(target) 46 | N = input.numel() 47 | input_log = torch.log(input + self.eps) 48 | target_log = torch.log(target + self.eps) 49 | log_d_diff = input_log - target_log 50 | 51 | log_d_diff = torch.mul(log_d_diff, mask) 52 | 53 | v_gradient = torch.abs(log_d_diff[0:-2, :] - log_d_diff[2:, :]) 54 | v_mask = torch.mul(mask[0:-2, :], mask[2:, :]) 55 | v_gradient = torch.mul(v_gradient, v_mask) 56 | 57 | h_gradient = torch.abs(log_d_diff[:, 0:-2] - log_d_diff[:, 2:]) 58 | h_mask = torch.mul(mask[:, 0:-2], mask[:, 2:]) 59 | h_gradient = torch.mul(h_gradient, h_mask) 60 | 61 | gradient_loss += (torch.sum(h_gradient) + torch.sum(v_gradient)) / N 62 | 63 | return gradient_loss 64 | 65 | def forward(self, depth_pred, depth_gt): 66 | """Forward function.""" 67 | 68 | gradient_loss = self.loss_weight * self.gradientloss(depth_pred, depth_gt) 69 | return gradient_loss 70 | -------------------------------------------------------------------------------- /dinov2/eval/depth/models/losses/sigloss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from ...models.builder import LOSSES 10 | 11 | 12 | @LOSSES.register_module() 13 | class SigLoss(nn.Module): 14 | """SigLoss. 15 | 16 | This follows `AdaBins `_. 17 | 18 | Args: 19 | valid_mask (bool): Whether filter invalid gt (gt > 0). Default: True. 20 | loss_weight (float): Weight of the loss. Default: 1.0. 21 | max_depth (int): When filtering invalid gt, set a max threshold. Default: None. 22 | warm_up (bool): A simple warm up stage to help convergence. Default: False. 23 | warm_iter (int): The number of warm up stage. Default: 100. 
24 | """ 25 | 26 | def __init__( 27 | self, valid_mask=True, loss_weight=1.0, max_depth=None, warm_up=False, warm_iter=100, loss_name="sigloss" 28 | ): 29 | super(SigLoss, self).__init__() 30 | self.valid_mask = valid_mask 31 | self.loss_weight = loss_weight 32 | self.max_depth = max_depth 33 | self.loss_name = loss_name 34 | 35 | self.eps = 0.001  # avoid grad explode 36 | 37 | # HACK: a hack implementation for warmup sigloss 38 | self.warm_up = warm_up 39 | self.warm_iter = warm_iter 40 | self.warm_up_counter = 0 41 | 42 | def sigloss(self, input, target): 43 | if self.valid_mask: 44 | valid_mask = target > 0 45 | if self.max_depth is not None: 46 | valid_mask = torch.logical_and(target > 0, target <= self.max_depth) 47 | input = input[valid_mask] 48 | target = target[valid_mask] 49 | 50 | if self.warm_up: 51 | if self.warm_up_counter < self.warm_iter: 52 | g = torch.log(input + self.eps) - torch.log(target + self.eps) 53 | g = 0.15 * torch.pow(torch.mean(g), 2) 54 | self.warm_up_counter += 1 55 | return torch.sqrt(g) 56 | 57 | g = torch.log(input + self.eps) - torch.log(target + self.eps) 58 | Dg = torch.var(g) + 0.15 * torch.pow(torch.mean(g), 2) 59 | return torch.sqrt(Dg) 60 | 61 | def forward(self, depth_pred, depth_gt): 62 | """Forward function.""" 63 | 64 | loss_depth = self.loss_weight * self.sigloss(depth_pred, depth_gt) 65 | return loss_depth 66 | -------------------------------------------------------------------------------- /dinov2/eval/depth/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .wrappers import resize 7 | -------------------------------------------------------------------------------- /dinov2/eval/depth/ops/wrappers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import warnings 7 | 8 | import torch.nn.functional as F 9 | 10 | 11 | def resize(input, size=None, scale_factor=None, mode="nearest", align_corners=None, warning=False): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > input_w: 17 | if ( 18 | (output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1) 19 | and (output_h - 1) % (input_h - 1) 20 | and (output_w - 1) % (input_w - 1) 21 | ): 22 | warnings.warn( 23 | f"When align_corners={align_corners}, " 24 | "the output would be more aligned if " 25 | f"input size {(input_h, input_w)} is `x+1` and " 26 | f"out size {(output_h, output_w)} is `nx+1`" 27 | ) 28 | return F.interpolate(input, size, scale_factor, mode, align_corners) 29 | -------------------------------------------------------------------------------- /dinov2/eval/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
5 | 6 | from enum import Enum 7 | import logging 8 | from typing import Any, Dict, Optional 9 | 10 | import torch 11 | from torch import Tensor 12 | from torchmetrics import Metric, MetricCollection 13 | from torchmetrics.classification import MulticlassAccuracy 14 | from torchmetrics.utilities.data import dim_zero_cat, select_topk 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | class MetricType(Enum): 21 | MEAN_ACCURACY = "mean_accuracy" 22 | MEAN_PER_CLASS_ACCURACY = "mean_per_class_accuracy" 23 | PER_CLASS_ACCURACY = "per_class_accuracy" 24 | IMAGENET_REAL_ACCURACY = "imagenet_real_accuracy" 25 | 26 | @property 27 | def accuracy_averaging(self): 28 | return getattr(AccuracyAveraging, self.name, None) 29 | 30 | def __str__(self): 31 | return self.value 32 | 33 | 34 | class AccuracyAveraging(Enum): 35 | MEAN_ACCURACY = "micro" 36 | MEAN_PER_CLASS_ACCURACY = "macro" 37 | PER_CLASS_ACCURACY = "none" 38 | 39 | def __str__(self): 40 | return self.value 41 | 42 | 43 | def build_metric(metric_type: MetricType, *, num_classes: int, ks: Optional[tuple] = None): 44 | if metric_type.accuracy_averaging is not None: 45 | return build_topk_accuracy_metric( 46 | average_type=metric_type.accuracy_averaging, 47 | num_classes=num_classes, 48 | ks=(1, 5) if ks is None else ks, 49 | ) 50 | elif metric_type == MetricType.IMAGENET_REAL_ACCURACY: 51 | return build_topk_imagenet_real_accuracy_metric( 52 | num_classes=num_classes, 53 | ks=(1, 5) if ks is None else ks, 54 | ) 55 | 56 | raise ValueError(f"Unknown metric type {metric_type}") 57 | 58 | 59 | def build_topk_accuracy_metric(average_type: AccuracyAveraging, num_classes: int, ks: tuple = (1, 5)): 60 | metrics: Dict[str, Metric] = { 61 | f"top-{k}": MulticlassAccuracy(top_k=k, num_classes=int(num_classes), average=average_type.value) for k in ks 62 | } 63 | return MetricCollection(metrics) 64 | 65 | 66 | def build_topk_imagenet_real_accuracy_metric(num_classes: int, ks: tuple = (1, 5)): 67 | metrics: Dict[str, Metric] = {f"top-{k}": ImageNetReaLAccuracy(top_k=k, num_classes=int(num_classes)) for k in ks} 68 | return MetricCollection(metrics) 69 | 70 | 71 | class ImageNetReaLAccuracy(Metric): 72 | is_differentiable: bool = False 73 | higher_is_better: Optional[bool] = None 74 | full_state_update: bool = False 75 | 76 | def __init__( 77 | self, 78 | num_classes: int, 79 | top_k: int = 1, 80 | **kwargs: Any, 81 | ) -> None: 82 | super().__init__(**kwargs) 83 | self.num_classes = num_classes 84 | self.top_k = top_k 85 | self.add_state("tp", [], dist_reduce_fx="cat") 86 | 87 | def update(self, preds: Tensor, target: Tensor) -> None: # type: ignore 88 | # preds [B, D] 89 | # target [B, A] 90 | # preds_oh [B, D] with 0 and 1 91 | # select top K highest probabilities, use one hot representation 92 | preds_oh = select_topk(preds, self.top_k) 93 | # target_oh [B, D + 1] with 0 and 1 94 | target_oh = torch.zeros((preds_oh.shape[0], preds_oh.shape[1] + 1), device=target.device, dtype=torch.int32) 95 | target = target.long() 96 | # for undefined targets (-1) use a fake value `num_classes` 97 | target[target == -1] = self.num_classes 98 | # fill targets, use one hot representation 99 | target_oh.scatter_(1, target, 1) 100 | # target_oh [B, D] (remove the fake target at index `num_classes`) 101 | target_oh = target_oh[:, :-1] 102 | # tp [B] with 0 and 1 103 | tp = (preds_oh * target_oh == 1).sum(dim=1) 104 | # at least one match between prediction and target 105 | tp.clip_(max=1) 106 | # ignore instances where no targets are defined 107 | mask = 
target_oh.sum(dim=1) > 0 108 | tp = tp[mask] 109 | self.tp.append(tp) # type: ignore 110 | 111 | def compute(self) -> Tensor: 112 | tp = dim_zero_cat(self.tp) # type: ignore 113 | return tp.float().mean() 114 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .optimizer import DistOptimizerHook 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/hooks/optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | try: 7 | import apex 8 | except ImportError: 9 | print("apex is not installed") 10 | 11 | from mmcv.runner import OptimizerHook, HOOKS 12 | 13 | 14 | @HOOKS.register_module() 15 | class DistOptimizerHook(OptimizerHook): 16 | """Optimizer hook for distributed training.""" 17 | 18 | def __init__(self, update_interval=1, grad_clip=None, coalesce=True, bucket_size_mb=-1, use_fp16=False): 19 | self.grad_clip = grad_clip 20 | self.coalesce = coalesce 21 | self.bucket_size_mb = bucket_size_mb 22 | self.update_interval = update_interval 23 | self.use_fp16 = use_fp16 24 | 25 | def before_run(self, runner): 26 | runner.optimizer.zero_grad() 27 | 28 | def after_train_iter(self, runner): 29 | runner.outputs["loss"] /= self.update_interval 30 | if self.use_fp16: 31 | # runner.outputs['loss'].backward() 32 | with apex.amp.scale_loss(runner.outputs["loss"], runner.optimizer) as scaled_loss: 33 | scaled_loss.backward() 34 | else: 35 | runner.outputs["loss"].backward() 36 | if self.every_n_iters(runner, self.update_interval): 37 | if self.grad_clip is not None: 38 | self.clip_grads(runner.model.parameters()) 39 | runner.optimizer.step() 40 | runner.optimizer.zero_grad() 41 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .backbones import * # noqa: F403 7 | from .decode_heads import * # noqa: F403 8 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
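# Hedged config sketch for the DistOptimizerHook above; the key names follow the
# usual mmcv runner conventions and the values are hypothetical. With
# update_interval=2 the hook backpropagates every iteration but steps the
# optimizer and zeroes the gradients only every second iteration, i.e. it
# performs gradient accumulation; use_fp16=True additionally routes the
# backward pass through apex AMP loss scaling.
optimizer_config = dict(
    type="DistOptimizerHook",
    update_interval=2,  # accumulate gradients over 2 iterations
    grad_clip=dict(max_norm=35, norm_type=2),  # forwarded to clip_grads
    use_fp16=False,  # True requires apex
)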
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .vision_transformer import DinoVisionTransformer 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/backbones/vision_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from mmcv.runner import BaseModule 7 | from mmseg.models.builder import BACKBONES 8 | 9 | 10 | @BACKBONES.register_module() 11 | class DinoVisionTransformer(BaseModule): 12 | """Vision Transformer.""" 13 | 14 | def __init__( 15 | self, 16 | *args, 17 | **kwargs, 18 | ): 19 | super().__init__() 20 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .linear_head import BNHead 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/models/decode_heads/linear_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from mmseg.models.builder import HEADS 10 | from mmseg.models.decode_heads.decode_head import BaseDecodeHead 11 | from mmseg.ops import resize 12 | 13 | 14 | @HEADS.register_module() 15 | class BNHead(BaseDecodeHead): 16 | """Just a batchnorm.""" 17 | 18 | def __init__(self, resize_factors=None, **kwargs): 19 | super().__init__(**kwargs) 20 | assert self.in_channels == self.channels 21 | self.bn = nn.SyncBatchNorm(self.in_channels) 22 | self.resize_factors = resize_factors 23 | 24 | def _forward_feature(self, inputs): 25 | """Forward function for feature maps before classifying each pixel with 26 | ``self.cls_seg`` fc. 27 | 28 | Args: 29 | inputs (list[Tensor]): List of multi-level img features. 30 | 31 | Returns: 32 | feats (Tensor): A tensor of shape (batch_size, self.channels, 33 | H, W) which is feature map for last layer of decoder head. 34 | """ 35 | # print("inputs", [i.shape for i in inputs]) 36 | x = self._transform_inputs(inputs) 37 | # print("x", x.shape) 38 | feats = self.bn(x) 39 | # print("feats", feats.shape) 40 | return feats 41 | 42 | def _transform_inputs(self, inputs): 43 | """Transform inputs for decoder. 44 | Args: 45 | inputs (list[Tensor]): List of multi-level img features. 
46 | Returns: 47 | Tensor: The transformed inputs 48 | """ 49 | 50 | if self.input_transform == "resize_concat": 51 | # accept lists (for cls token) 52 | input_list = [] 53 | for x in inputs: 54 | if isinstance(x, list): 55 | input_list.extend(x) 56 | else: 57 | input_list.append(x) 58 | inputs = input_list 59 | # an image descriptor can be a local descriptor with resolution 1x1 60 | for i, x in enumerate(inputs): 61 | if len(x.shape) == 2: 62 | inputs[i] = x[:, :, None, None] 63 | # select indices 64 | inputs = [inputs[i] for i in self.in_index] 65 | # Resizing shenanigans 66 | # print("before", *(x.shape for x in inputs)) 67 | if self.resize_factors is not None: 68 | assert len(self.resize_factors) == len(inputs), (len(self.resize_factors), len(inputs)) 69 | inputs = [ 70 | resize(input=x, scale_factor=f, mode="bilinear" if f >= 1 else "area") 71 | for x, f in zip(inputs, self.resize_factors) 72 | ] 73 | # print("after", *(x.shape for x in inputs)) 74 | upsampled_inputs = [ 75 | resize(input=x, size=inputs[0].shape[2:], mode="bilinear", align_corners=self.align_corners) 76 | for x in inputs 77 | ] 78 | inputs = torch.cat(upsampled_inputs, dim=1) 79 | elif self.input_transform == "multiple_select": 80 | inputs = [inputs[i] for i in self.in_index] 81 | else: 82 | inputs = inputs[self.in_index] 83 | 84 | return inputs 85 | 86 | def forward(self, inputs): 87 | """Forward function.""" 88 | output = self._forward_feature(inputs) 89 | output = self.cls_seg(output) 90 | return output 91 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .core import * # noqa: F403 7 | from .models import * # noqa: F403 8 | from .ops import * # noqa: F403 9 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from mmseg.core.evaluation import * # noqa: F403 7 | from mmseg.core.seg import * # noqa: F403 8 | 9 | from .anchor import * # noqa: F403 10 | from .box import * # noqa: F403 11 | from .utils import * # noqa: F403 12 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/anchor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
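# Hedged sketch of what BNHead._transform_inputs above does for
# input_transform="resize_concat" (hypothetical shapes): multi-level features
# are upsampled to the first level's resolution and concatenated along the
# channel dimension before the single SyncBatchNorm and the `cls_seg` classifier.
import torch
import torch.nn.functional as F

feats = [torch.rand(2, 384, 32, 32), torch.rand(2, 384, 16, 16)]
upsampled = [F.interpolate(x, size=feats[0].shape[2:], mode="bilinear", align_corners=False) for x in feats]
fused = torch.cat(upsampled, dim=1)  # shape (2, 768, 32, 32)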
5 | 6 | from .point_generator import MlvlPointGenerator # noqa: F403 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/anchor/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import warnings 7 | 8 | from mmcv.utils import Registry, build_from_cfg 9 | 10 | PRIOR_GENERATORS = Registry("Generator for anchors and points") 11 | 12 | ANCHOR_GENERATORS = PRIOR_GENERATORS 13 | 14 | 15 | def build_prior_generator(cfg, default_args=None): 16 | return build_from_cfg(cfg, PRIOR_GENERATORS, default_args) 17 | 18 | 19 | def build_anchor_generator(cfg, default_args=None): 20 | warnings.warn("``build_anchor_generator`` would be deprecated soon, please use " "``build_prior_generator`` ") 21 | return build_prior_generator(cfg, default_args=default_args) 22 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .builder import * # noqa: F403 7 | from .samplers import MaskPseudoSampler # noqa: F403 8 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from mmcv.utils import Registry, build_from_cfg 7 | 8 | BBOX_SAMPLERS = Registry("bbox_sampler") 9 | BBOX_CODERS = Registry("bbox_coder") 10 | 11 | 12 | def build_sampler(cfg, **default_args): 13 | """Builder of box sampler.""" 14 | return build_from_cfg(cfg, BBOX_SAMPLERS, default_args) 15 | 16 | 17 | def build_bbox_coder(cfg, **default_args): 18 | """Builder of box coder.""" 19 | return build_from_cfg(cfg, BBOX_CODERS, default_args) 20 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .mask_pseudo_sampler import MaskPseudoSampler # noqa: F403 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/base_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
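# Hedged sketch of the mmcv registry pattern used by the builders above: the
# "type" key of a config dict selects a registered class and the remaining keys
# become constructor kwargs. Importing the `core.box` package registers the
# samplers as a side effect.
from dinov2.eval.segmentation_m2f.core.box import build_sampler

sampler = build_sampler(dict(type="MaskPseudoSampler"))  # -> a MaskPseudoSampler instance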
5 | 6 | from abc import ABCMeta, abstractmethod 7 | 8 | import torch 9 | 10 | from .sampling_result import SamplingResult 11 | 12 | 13 | class BaseSampler(metaclass=ABCMeta): 14 | """Base class of samplers.""" 15 | 16 | def __init__(self, num, pos_fraction, neg_pos_ub=-1, add_gt_as_proposals=True, **kwargs): 17 | self.num = num 18 | self.pos_fraction = pos_fraction 19 | self.neg_pos_ub = neg_pos_ub 20 | self.add_gt_as_proposals = add_gt_as_proposals 21 | self.pos_sampler = self 22 | self.neg_sampler = self 23 | 24 | @abstractmethod 25 | def _sample_pos(self, assign_result, num_expected, **kwargs): 26 | """Sample positive samples.""" 27 | pass 28 | 29 | @abstractmethod 30 | def _sample_neg(self, assign_result, num_expected, **kwargs): 31 | """Sample negative samples.""" 32 | pass 33 | 34 | def sample(self, assign_result, bboxes, gt_bboxes, gt_labels=None, **kwargs): 35 | """Sample positive and negative bboxes. 36 | 37 | This is a simple implementation of bbox sampling given candidates, 38 | assigning results and ground truth bboxes. 39 | 40 | Args: 41 | assign_result (:obj:`AssignResult`): Bbox assigning results. 42 | bboxes (Tensor): Boxes to be sampled from. 43 | gt_bboxes (Tensor): Ground truth bboxes. 44 | gt_labels (Tensor, optional): Class labels of ground truth bboxes. 45 | 46 | Returns: 47 | :obj:`SamplingResult`: Sampling result. 48 | 49 | Example: 50 | >>> from mmdet.core.bbox import RandomSampler 51 | >>> from mmdet.core.bbox import AssignResult 52 | >>> from mmdet.core.bbox.demodata import ensure_rng, random_boxes 53 | >>> rng = ensure_rng(None) 54 | >>> assign_result = AssignResult.random(rng=rng) 55 | >>> bboxes = random_boxes(assign_result.num_preds, rng=rng) 56 | >>> gt_bboxes = random_boxes(assign_result.num_gts, rng=rng) 57 | >>> gt_labels = None 58 | >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1, 59 | >>> add_gt_as_proposals=False) 60 | >>> self = self.sample(assign_result, bboxes, gt_bboxes, gt_labels) 61 | """ 62 | if len(bboxes.shape) < 2: 63 | bboxes = bboxes[None, :] 64 | 65 | bboxes = bboxes[:, :4] 66 | 67 | gt_flags = bboxes.new_zeros((bboxes.shape[0],), dtype=torch.uint8) 68 | if self.add_gt_as_proposals and len(gt_bboxes) > 0: 69 | if gt_labels is None: 70 | raise ValueError("gt_labels must be given when add_gt_as_proposals is True") 71 | bboxes = torch.cat([gt_bboxes, bboxes], dim=0) 72 | assign_result.add_gt_(gt_labels) 73 | gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) 74 | gt_flags = torch.cat([gt_ones, gt_flags]) 75 | 76 | num_expected_pos = int(self.num * self.pos_fraction) 77 | pos_inds = self.pos_sampler._sample_pos(assign_result, num_expected_pos, bboxes=bboxes, **kwargs) 78 | # We found that sampled indices have duplicated items occasionally. 
79 | # (may be a bug of PyTorch) 80 | pos_inds = pos_inds.unique() 81 | num_sampled_pos = pos_inds.numel() 82 | num_expected_neg = self.num - num_sampled_pos 83 | if self.neg_pos_ub >= 0: 84 | _pos = max(1, num_sampled_pos) 85 | neg_upper_bound = int(self.neg_pos_ub * _pos) 86 | if num_expected_neg > neg_upper_bound: 87 | num_expected_neg = neg_upper_bound 88 | neg_inds = self.neg_sampler._sample_neg(assign_result, num_expected_neg, bboxes=bboxes, **kwargs) 89 | neg_inds = neg_inds.unique() 90 | 91 | sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags) 92 | return sampling_result 93 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/mask_pseudo_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py 8 | 9 | import torch 10 | 11 | from ..builder import BBOX_SAMPLERS 12 | from .base_sampler import BaseSampler 13 | from .mask_sampling_result import MaskSamplingResult 14 | 15 | 16 | @BBOX_SAMPLERS.register_module() 17 | class MaskPseudoSampler(BaseSampler): 18 | """A pseudo sampler that does not actually do any sampling.""" 19 | 20 | def __init__(self, **kwargs): 21 | pass 22 | 23 | def _sample_pos(self, **kwargs): 24 | """Sample positive samples.""" 25 | raise NotImplementedError 26 | 27 | def _sample_neg(self, **kwargs): 28 | """Sample negative samples.""" 29 | raise NotImplementedError 30 | 31 | def sample(self, assign_result, masks, gt_masks, **kwargs): 32 | """Directly returns the positive and negative indices of samples. 33 | 34 | Args: 35 | assign_result (:obj:`AssignResult`): Assigned results 36 | masks (torch.Tensor): Predicted masks 37 | gt_masks (torch.Tensor): Ground truth masks 38 | Returns: 39 | :obj:`MaskSamplingResult`: Sampling results 40 | """ 41 | pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() 42 | neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() 43 | gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8) 44 | sampling_result = MaskSamplingResult(pos_inds, neg_inds, masks, gt_masks, assign_result, gt_flags) 45 | return sampling_result 46 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/mask_sampling_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
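# Hedged usage sketch of MaskPseudoSampler.sample above (tensor values are
# hypothetical; requires this package's mmcv-style dependencies): queries with
# gt_inds > 0 become positives and gt_inds == 0 negatives; nothing is subsampled.
import torch
from dinov2.eval.segmentation_m2f.core.box.samplers import MaskPseudoSampler
from dinov2.eval.segmentation_m2f.models.utils.assigner import AssignResult

assign_result = AssignResult(num_gts=2, gt_inds=torch.tensor([1, 0, 2, 0]), labels=torch.tensor([3, -1, 7, -1]))
masks, gt_masks = torch.rand(4, 32, 32), torch.rand(2, 32, 32)
result = MaskPseudoSampler().sample(assign_result, masks, gt_masks)
print(result.pos_inds, result.neg_inds)  # tensor([0, 2]) tensor([1, 3])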
5 | 6 | # References: 7 | # https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py 8 | 9 | import torch 10 | 11 | from .sampling_result import SamplingResult 12 | 13 | 14 | class MaskSamplingResult(SamplingResult): 15 | """Mask sampling result.""" 16 | 17 | def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result, gt_flags): 18 | self.pos_inds = pos_inds 19 | self.neg_inds = neg_inds 20 | self.pos_masks = masks[pos_inds] 21 | self.neg_masks = masks[neg_inds] 22 | self.pos_is_gt = gt_flags[pos_inds] 23 | 24 | self.num_gts = gt_masks.shape[0] 25 | self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 26 | 27 | if gt_masks.numel() == 0: 28 | # hack for index error case 29 | assert self.pos_assigned_gt_inds.numel() == 0 30 | self.pos_gt_masks = torch.empty_like(gt_masks) 31 | else: 32 | self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] 33 | 34 | if assign_result.labels is not None: 35 | self.pos_gt_labels = assign_result.labels[pos_inds] 36 | else: 37 | self.pos_gt_labels = None 38 | 39 | @property 40 | def masks(self): 41 | """torch.Tensor: concatenated positive and negative boxes""" 42 | return torch.cat([self.pos_masks, self.neg_masks]) 43 | 44 | def __nice__(self): 45 | data = self.info.copy() 46 | data["pos_masks"] = data.pop("pos_masks").shape 47 | data["neg_masks"] = data.pop("neg_masks").shape 48 | parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] 49 | body = " " + ",\n ".join(parts) 50 | return "{\n" + body + "\n}" 51 | 52 | @property 53 | def info(self): 54 | """Returns a dictionary of info about the object.""" 55 | return { 56 | "pos_inds": self.pos_inds, 57 | "neg_inds": self.neg_inds, 58 | "pos_masks": self.pos_masks, 59 | "neg_masks": self.neg_masks, 60 | "pos_is_gt": self.pos_is_gt, 61 | "num_gts": self.num_gts, 62 | "pos_assigned_gt_inds": self.pos_assigned_gt_inds, 63 | } 64 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/box/samplers/sampling_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | 8 | 9 | class SamplingResult: 10 | """Bbox sampling result. 
11 | 12 | Example: 13 | >>> # xdoctest: +IGNORE_WANT 14 | >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA 15 | >>> self = SamplingResult.random(rng=10) 16 | >>> print(f'self = {self}') 17 | self = 26 | """ 27 | 28 | def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags): 29 | self.pos_inds = pos_inds 30 | self.neg_inds = neg_inds 31 | self.pos_bboxes = bboxes[pos_inds] 32 | self.neg_bboxes = bboxes[neg_inds] 33 | self.pos_is_gt = gt_flags[pos_inds] 34 | 35 | self.num_gts = gt_bboxes.shape[0] 36 | self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 37 | 38 | if gt_bboxes.numel() == 0: 39 | # hack for index error case 40 | assert self.pos_assigned_gt_inds.numel() == 0 41 | self.pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) 42 | else: 43 | if len(gt_bboxes.shape) < 2: 44 | gt_bboxes = gt_bboxes.view(-1, 4) 45 | 46 | self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds.long(), :] 47 | 48 | if assign_result.labels is not None: 49 | self.pos_gt_labels = assign_result.labels[pos_inds] 50 | else: 51 | self.pos_gt_labels = None 52 | 53 | @property 54 | def bboxes(self): 55 | """torch.Tensor: concatenated positive and negative boxes""" 56 | return torch.cat([self.pos_bboxes, self.neg_bboxes]) 57 | 58 | def to(self, device): 59 | """Change the device of the data inplace. 60 | 61 | Example: 62 | >>> self = SamplingResult.random() 63 | >>> print(f'self = {self.to(None)}') 64 | >>> # xdoctest: +REQUIRES(--gpu) 65 | >>> print(f'self = {self.to(0)}') 66 | """ 67 | _dict = self.__dict__ 68 | for key, value in _dict.items(): 69 | if isinstance(value, torch.Tensor): 70 | _dict[key] = value.to(device) 71 | return self 72 | 73 | def __nice__(self): 74 | data = self.info.copy() 75 | data["pos_bboxes"] = data.pop("pos_bboxes").shape 76 | data["neg_bboxes"] = data.pop("neg_bboxes").shape 77 | parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] 78 | body = " " + ",\n ".join(parts) 79 | return "{\n" + body + "\n}" 80 | 81 | @property 82 | def info(self): 83 | """Returns a dictionary of info about the object.""" 84 | return { 85 | "pos_inds": self.pos_inds, 86 | "neg_inds": self.neg_inds, 87 | "pos_bboxes": self.pos_bboxes, 88 | "neg_bboxes": self.neg_bboxes, 89 | "pos_is_gt": self.pos_is_gt, 90 | "num_gts": self.num_gts, 91 | "pos_assigned_gt_inds": self.pos_assigned_gt_inds, 92 | } 93 | 94 | @classmethod 95 | def random(cls, rng=None, **kwargs): 96 | """ 97 | Args: 98 | rng (None | int | numpy.random.RandomState): seed or state. 99 | kwargs (keyword arguments): 100 | - num_preds: number of predicted boxes 101 | - num_gts: number of true boxes 102 | - p_ignore (float): probability of a predicted box assigned to \ 103 | an ignored truth. 104 | - p_assigned (float): probability of a predicted box not being \ 105 | assigned. 106 | - p_use_label (float | bool): with labels or not. 107 | 108 | Returns: 109 | :obj:`SamplingResult`: Randomly generated sampling result. 110 | 111 | Example: 112 | >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA 113 | >>> self = SamplingResult.random() 114 | >>> print(self.__dict__) 115 | """ 116 | from mmdet.core.bbox import demodata 117 | from mmdet.core.bbox.assigners.assign_result import AssignResult 118 | from mmdet.core.bbox.samplers.random_sampler import RandomSampler 119 | 120 | rng = demodata.ensure_rng(rng) 121 | 122 | # make probabalistic? 
123 | num = 32 124 | pos_fraction = 0.5 125 | neg_pos_ub = -1 126 | 127 | assign_result = AssignResult.random(rng=rng, **kwargs) 128 | 129 | # Note we could just compute an assignment 130 | bboxes = demodata.random_boxes(assign_result.num_preds, rng=rng) 131 | gt_bboxes = demodata.random_boxes(assign_result.num_gts, rng=rng) 132 | 133 | if rng.rand() > 0.2: 134 | # sometimes algorithms squeeze their data, be robust to that 135 | gt_bboxes = gt_bboxes.squeeze() 136 | bboxes = bboxes.squeeze() 137 | 138 | if assign_result.labels is None: 139 | gt_labels = None 140 | else: 141 | gt_labels = None # both branches are None: random gt labels are not generated here 142 | 143 | if gt_labels is None: 144 | add_gt_as_proposals = False 145 | else: 146 | add_gt_as_proposals = True # make probabilistic? 147 | 148 | sampler = RandomSampler( 149 | num, pos_fraction, neg_pos_ub=neg_pos_ub, add_gt_as_proposals=add_gt_as_proposals, rng=rng 150 | ) 151 | self = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels) 152 | return self 153 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .dist_utils import reduce_mean 7 | from .misc import add_prefix, multi_apply 8 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch.distributed as dist 7 | 8 | 9 | def reduce_mean(tensor): 10 | """Obtain the mean of a tensor across different GPUs.""" 11 | if not (dist.is_available() and dist.is_initialized()): 12 | return tensor 13 | tensor = tensor.clone() 14 | dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) 15 | return tensor 16 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/core/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from functools import partial 7 | 8 | 9 | def multi_apply(func, *args, **kwargs): 10 | """Apply function to a list of arguments. 11 | 12 | Note: 13 | This function applies ``func`` to multiple inputs and 14 | maps the multiple outputs of ``func`` into different 15 | lists. Each list contains the same type of outputs corresponding 16 | to different inputs. 17 | 18 | Args: 19 | func (Function): A function that will be applied to a list of 20 | arguments 21 | 22 | Returns: 23 | tuple(list): A tuple containing multiple lists, where each list \ 24 | contains one kind of result returned by the function 25 | """ 26 | pfunc = partial(func, **kwargs) if kwargs else func 27 | map_results = map(pfunc, *args) 28 | return tuple(map(list, zip(*map_results))) 29 | 30 | 31 | def add_prefix(inputs, prefix): 32 | """Add prefix for dict.
33 | 34 | Args: 35 | inputs (dict): The input dict with str keys. 36 | prefix (str): The prefix to add. 37 | 38 | Returns: 39 | 40 | dict: The dict with keys updated with ``prefix``. 41 | """ 42 | 43 | outputs = dict() 44 | for name, value in inputs.items(): 45 | outputs[f"{prefix}.{name}"] = value 46 | 47 | return outputs 48 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .backbones import * # noqa: F403 7 | from .builder import MASK_ASSIGNERS, MATCH_COST, TRANSFORMER, build_assigner, build_match_cost 8 | from .decode_heads import * # noqa: F403 9 | from .losses import * # noqa: F403 10 | from .plugins import * # noqa: F403 11 | from .segmentors import * # noqa: F403 12 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .vit_adapter import ViTAdapter 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/backbones/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 9 | 10 | from torch import nn 11 | 12 | 13 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 14 | if drop_prob == 0.0 or not training: 15 | return x 16 | keep_prob = 1 - drop_prob 17 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 18 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 19 | if keep_prob > 0.0: 20 | random_tensor.div_(keep_prob) 21 | return x * random_tensor 22 | 23 | 24 | class DropPath(nn.Module): 25 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 26 | 27 | def __init__(self, drop_prob: float = 0.0): 28 | super(DropPath, self).__init__() 29 | self.drop_prob = drop_prob 30 | 31 | def forward(self, x): 32 | return drop_path(x, self.drop_prob, self.training) 33 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
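# Hedged sketch of the stochastic-depth module above (assumes this package's
# dependencies are installed): in training mode each sample's residual branch is
# zeroed with probability drop_prob and survivors are rescaled by 1 / keep_prob,
# keeping the expected value unchanged; in eval mode it is the identity.
import torch
from dinov2.eval.segmentation_m2f.models.backbones.drop_path import DropPath

dp = DropPath(drop_prob=0.5).train()
out = dp(torch.ones(8, 4))  # each row is either all 0.0 or all 2.0
assert torch.equal(dp.eval()(torch.ones(8, 4)), torch.ones(8, 4))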
5 | 6 | from mmcv.utils import Registry 7 | 8 | TRANSFORMER = Registry("Transformer") 9 | MASK_ASSIGNERS = Registry("mask_assigner") 10 | MATCH_COST = Registry("match_cost") 11 | 12 | 13 | def build_match_cost(cfg): 14 | """Build Match Cost.""" 15 | return MATCH_COST.build(cfg) 16 | 17 | 18 | def build_assigner(cfg): 19 | """Build Assigner.""" 20 | return MASK_ASSIGNERS.build(cfg) 21 | 22 | 23 | def build_transformer(cfg): 24 | """Build Transformer.""" 25 | return TRANSFORMER.build(cfg) 26 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .mask2former_head import Mask2FormerHead 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .cross_entropy_loss import CrossEntropyLoss, binary_cross_entropy, cross_entropy, mask_cross_entropy 7 | from .dice_loss import DiceLoss 8 | from .match_costs import ClassificationCost, CrossEntropyLossCost, DiceCost 9 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/losses/dice_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | from mmseg.models.builder import LOSSES 9 | from mmseg.models.losses.utils import weight_reduce_loss 10 | 11 | 12 | def dice_loss(pred, target, weight=None, eps=1e-3, reduction="mean", avg_factor=None): 13 | """Calculate dice loss, which is proposed in 14 | `V-Net: Fully Convolutional Neural Networks for Volumetric 15 | Medical Image Segmentation <https://arxiv.org/abs/1606.04797>`_. 16 | 17 | Args: 18 | pred (torch.Tensor): The prediction, has a shape (n, *) 19 | target (torch.Tensor): The learning label of the prediction, 20 | shape (n, *), same shape of pred. 21 | weight (torch.Tensor, optional): The weight of loss for each 22 | prediction, has a shape (n,). Defaults to None. 23 | eps (float): Avoid dividing by zero. Default: 1e-3. 24 | reduction (str, optional): The method used to reduce the loss into 25 | a scalar. Defaults to 'mean'. 26 | Options are "none", "mean" and "sum". 27 | avg_factor (int, optional): Average factor that is used to average 28 | the loss. Defaults to None.
29 | """ 30 | 31 | input = pred.flatten(1) 32 | target = target.flatten(1).float() 33 | 34 | a = torch.sum(input * target, 1) 35 | b = torch.sum(input * input, 1) + eps 36 | c = torch.sum(target * target, 1) + eps 37 | d = (2 * a) / (b + c) 38 | loss = 1 - d 39 | if weight is not None: 40 | assert weight.ndim == loss.ndim 41 | assert len(weight) == len(pred) 42 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 43 | return loss 44 | 45 | 46 | def naive_dice_loss(pred, target, weight=None, eps=1e-3, reduction="mean", avg_factor=None): 47 | """Calculate naive dice loss, the coefficient in the denominator is the 48 | first power instead of the second power. 49 | 50 | Args: 51 | pred (torch.Tensor): The prediction, has a shape (n, *) 52 | target (torch.Tensor): The learning label of the prediction, 53 | shape (n, *), same shape of pred. 54 | weight (torch.Tensor, optional): The weight of loss for each 55 | prediction, has a shape (n,). Defaults to None. 56 | eps (float): Avoid dividing by zero. Default: 1e-3. 57 | reduction (str, optional): The method used to reduce the loss into 58 | a scalar. Defaults to 'mean'. 59 | Options are "none", "mean" and "sum". 60 | avg_factor (int, optional): Average factor that is used to average 61 | the loss. Defaults to None. 62 | """ 63 | input = pred.flatten(1) 64 | target = target.flatten(1).float() 65 | 66 | a = torch.sum(input * target, 1) 67 | b = torch.sum(input, 1) 68 | c = torch.sum(target, 1) 69 | d = (2 * a + eps) / (b + c + eps) 70 | loss = 1 - d 71 | if weight is not None: 72 | assert weight.ndim == loss.ndim 73 | assert len(weight) == len(pred) 74 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 75 | return loss 76 | 77 | 78 | @LOSSES.register_module(force=True) 79 | class DiceLoss(nn.Module): 80 | def __init__(self, use_sigmoid=True, activate=True, reduction="mean", naive_dice=False, loss_weight=1.0, eps=1e-3): 81 | """Dice Loss, there are two forms of dice loss is supported: 82 | 83 | - the one proposed in `V-Net: Fully Convolutional Neural 84 | Networks for Volumetric Medical Image Segmentation 85 | `_. 86 | - the dice loss in which the power of the number in the 87 | denominator is the first power instead of the second 88 | power. 89 | 90 | Args: 91 | use_sigmoid (bool, optional): Whether to the prediction is 92 | used for sigmoid or softmax. Defaults to True. 93 | activate (bool): Whether to activate the predictions inside, 94 | this will disable the inside sigmoid operation. 95 | Defaults to True. 96 | reduction (str, optional): The method used 97 | to reduce the loss. Options are "none", 98 | "mean" and "sum". Defaults to 'mean'. 99 | naive_dice (bool, optional): If false, use the dice 100 | loss defined in the V-Net paper, otherwise, use the 101 | naive dice loss in which the power of the number in the 102 | denominator is the first power instead of the second 103 | power.Defaults to False. 104 | loss_weight (float, optional): Weight of loss. Defaults to 1.0. 105 | eps (float): Avoid dividing by zero. Defaults to 1e-3. 106 | """ 107 | 108 | super(DiceLoss, self).__init__() 109 | self.use_sigmoid = use_sigmoid 110 | self.reduction = reduction 111 | self.naive_dice = naive_dice 112 | self.loss_weight = loss_weight 113 | self.eps = eps 114 | self.activate = activate 115 | 116 | def forward(self, pred, target, weight=None, reduction_override=None, avg_factor=None): 117 | """Forward function. 118 | 119 | Args: 120 | pred (torch.Tensor): The prediction, has a shape (n, *). 
121 | target (torch.Tensor): The label of the prediction, 122 | shape (n, *), same shape of pred. 123 | weight (torch.Tensor, optional): The weight of loss for each 124 | prediction, has a shape (n,). Defaults to None. 125 | avg_factor (int, optional): Average factor that is used to average 126 | the loss. Defaults to None. 127 | reduction_override (str, optional): The reduction method used to 128 | override the original reduction method of the loss. 129 | Options are "none", "mean" and "sum". 130 | 131 | Returns: 132 | torch.Tensor: The calculated loss 133 | """ 134 | 135 | assert reduction_override in (None, "none", "mean", "sum") 136 | reduction = reduction_override if reduction_override else self.reduction 137 | 138 | if self.activate: 139 | if self.use_sigmoid: 140 | pred = pred.sigmoid() 141 | else: 142 | raise NotImplementedError 143 | 144 | if self.naive_dice: 145 | loss = self.loss_weight * naive_dice_loss( 146 | pred, target, weight, eps=self.eps, reduction=reduction, avg_factor=avg_factor 147 | ) 148 | else: 149 | loss = self.loss_weight * dice_loss( 150 | pred, target, weight, eps=self.eps, reduction=reduction, avg_factor=avg_factor 151 | ) 152 | 153 | return loss 154 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/losses/match_costs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | from ..builder import MATCH_COST 10 | 11 | 12 | @MATCH_COST.register_module() 13 | class ClassificationCost: 14 | """ClsSoftmaxCost. Borrowed from 15 | mmdet.core.bbox.match_costs.match_cost.ClassificationCost. 16 | 17 | Args: 18 | weight (int | float, optional): loss_weight 19 | 20 | Examples: 21 | >>> import torch 22 | >>> self = ClassificationCost() 23 | >>> cls_pred = torch.rand(4, 3) 24 | >>> gt_labels = torch.tensor([0, 1, 2]) 25 | >>> factor = torch.tensor([10, 8, 10, 8]) 26 | >>> self(cls_pred, gt_labels) 27 | tensor([[-0.3430, -0.3525, -0.3045], 28 | [-0.3077, -0.2931, -0.3992], 29 | [-0.3664, -0.3455, -0.2881], 30 | [-0.3343, -0.2701, -0.3956]]) 31 | """ 32 | 33 | def __init__(self, weight=1.0): 34 | self.weight = weight 35 | 36 | def __call__(self, cls_pred, gt_labels): 37 | """ 38 | Args: 39 | cls_pred (Tensor): Predicted classification logits, shape 40 | [num_query, num_class]. 41 | gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 42 | 43 | Returns: 44 | torch.Tensor: cls_cost value with weight 45 | """ 46 | # Following the official DETR repo: while the loss uses NLL, 47 | # the cost here is approximated by 1 - cls_score[gt_label]. 48 | # The 1 is a constant that doesn't change the matching, 49 | # so it can be omitted. 50 | cls_score = cls_pred.softmax(-1) 51 | cls_cost = -cls_score[:, gt_labels] 52 | return cls_cost * self.weight 53 | 54 | 55 | @MATCH_COST.register_module() 56 | class DiceCost: 57 | """Cost of mask assignments based on dice losses. 58 | 59 | Args: 60 | weight (int | float, optional): loss_weight. Defaults to 1. 61 | pred_act (bool, optional): Whether to apply sigmoid to mask_pred. 62 | Defaults to False. 63 | eps (float, optional): Defaults to 1e-3.
64 | """ 65 | 66 | def __init__(self, weight=1.0, pred_act=False, eps=1e-3): 67 | self.weight = weight 68 | self.pred_act = pred_act 69 | self.eps = eps 70 | 71 | def binary_mask_dice_loss(self, mask_preds, gt_masks): 72 | """ 73 | Args: 74 | mask_preds (Tensor): Mask prediction in shape (N1, H, W). 75 | gt_masks (Tensor): Ground truth in shape (N2, H, W) 76 | store 0 or 1, 0 for negative class and 1 for 77 | positive class. 78 | 79 | Returns: 80 | Tensor: Dice cost matrix in shape (N1, N2). 81 | """ 82 | mask_preds = mask_preds.reshape((mask_preds.shape[0], -1)) 83 | gt_masks = gt_masks.reshape((gt_masks.shape[0], -1)).float() 84 | numerator = 2 * torch.einsum("nc,mc->nm", mask_preds, gt_masks) 85 | denominator = mask_preds.sum(-1)[:, None] + gt_masks.sum(-1)[None, :] 86 | loss = 1 - (numerator + self.eps) / (denominator + self.eps) 87 | return loss 88 | 89 | def __call__(self, mask_preds, gt_masks): 90 | """ 91 | Args: 92 | mask_preds (Tensor): Mask prediction logits in shape (N1, H, W). 93 | gt_masks (Tensor): Ground truth in shape (N2, H, W). 94 | 95 | Returns: 96 | Tensor: Dice cost matrix in shape (N1, N2). 97 | """ 98 | if self.pred_act: 99 | mask_preds = mask_preds.sigmoid() 100 | dice_cost = self.binary_mask_dice_loss(mask_preds, gt_masks) 101 | return dice_cost * self.weight 102 | 103 | 104 | @MATCH_COST.register_module() 105 | class CrossEntropyLossCost: 106 | """CrossEntropyLossCost. 107 | 108 | Args: 109 | weight (int | float, optional): loss weight. Defaults to 1. 110 | use_sigmoid (bool, optional): Whether the prediction uses sigmoid 111 | of softmax. Defaults to True. 112 | """ 113 | 114 | def __init__(self, weight=1.0, use_sigmoid=True): 115 | assert use_sigmoid, "use_sigmoid = False is not supported yet." 116 | self.weight = weight 117 | self.use_sigmoid = use_sigmoid 118 | 119 | def _binary_cross_entropy(self, cls_pred, gt_labels): 120 | """ 121 | Args: 122 | cls_pred (Tensor): The prediction with shape (num_query, 1, *) or 123 | (num_query, *). 124 | gt_labels (Tensor): The learning label of prediction with 125 | shape (num_gt, *). 126 | Returns: 127 | Tensor: Cross entropy cost matrix in shape (num_query, num_gt). 128 | """ 129 | cls_pred = cls_pred.flatten(1).float() 130 | gt_labels = gt_labels.flatten(1).float() 131 | n = cls_pred.shape[1] 132 | pos = F.binary_cross_entropy_with_logits(cls_pred, torch.ones_like(cls_pred), reduction="none") 133 | neg = F.binary_cross_entropy_with_logits(cls_pred, torch.zeros_like(cls_pred), reduction="none") 134 | cls_cost = torch.einsum("nc,mc->nm", pos, gt_labels) + torch.einsum("nc,mc->nm", neg, 1 - gt_labels) 135 | cls_cost = cls_cost / n 136 | 137 | return cls_cost 138 | 139 | def __call__(self, cls_pred, gt_labels): 140 | """ 141 | Args: 142 | cls_pred (Tensor): Predicted classification logits. 143 | gt_labels (Tensor): Labels. 144 | Returns: 145 | Tensor: Cross entropy cost matrix with weight in 146 | shape (num_query, num_gt). 147 | """ 148 | if self.use_sigmoid: 149 | cls_cost = self._binary_cross_entropy(cls_pred, gt_labels) 150 | else: 151 | raise NotImplementedError 152 | 153 | return cls_cost * self.weight 154 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/segmentors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .encoder_decoder_mask2former import EncoderDecoderMask2Former 7 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .assigner import MaskHungarianAssigner 7 | from .point_sample import get_uncertain_point_coords_with_randomness 8 | from .positional_encoding import LearnedPositionalEncoding, SinePositionalEncoding 9 | from .transformer import DetrTransformerDecoder, DetrTransformerDecoderLayer, DynamicConv, Transformer 10 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/utils/assigner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from abc import ABCMeta, abstractmethod 7 | 8 | import torch 9 | 10 | from ..builder import MASK_ASSIGNERS, build_match_cost 11 | 12 | try: 13 | from scipy.optimize import linear_sum_assignment 14 | except ImportError: 15 | linear_sum_assignment = None 16 | 17 | 18 | class AssignResult(metaclass=ABCMeta): 19 | """Collection of assign results.""" 20 | 21 | def __init__(self, num_gts, gt_inds, labels): 22 | self.num_gts = num_gts 23 | self.gt_inds = gt_inds 24 | self.labels = labels 25 | 26 | @property 27 | def info(self): 28 | info = { 29 | "num_gts": self.num_gts, 30 | "gt_inds": self.gt_inds, 31 | "labels": self.labels, 32 | } 33 | return info 34 | 35 | 36 | class BaseAssigner(metaclass=ABCMeta): 37 | """Base assigner that assigns boxes to ground truth boxes.""" 38 | 39 | @abstractmethod 40 | def assign(self, masks, gt_masks, gt_masks_ignore=None, gt_labels=None): 41 | """Assign each box to either a ground truth box or a negative sample.""" 42 | pass 43 | 44 | 45 | @MASK_ASSIGNERS.register_module() 46 | class MaskHungarianAssigner(BaseAssigner): 47 | """Computes one-to-one matching between predictions and ground truth for 48 | masks. 49 | 50 | This class computes an assignment between the targets and the predictions 51 | based on the costs. The costs are a weighted sum of three components: 52 | the classification cost, the mask cost and the dice cost. The 53 | targets don't include the no_object category, so generally there are more 54 | predictions than targets. After the one-to-one matching, the un-matched 55 | are treated as backgrounds.
Thus each query prediction will be assigned 56 | with `0` or a positive integer indicating the ground truth index: 57 | 58 | - 0: negative sample, no assigned gt 59 | - positive integer: positive sample, index (1-based) of assigned gt 60 | 61 | Args: 62 | cls_cost (obj:`mmcv.ConfigDict`|dict): Classification cost config. 63 | mask_cost (obj:`mmcv.ConfigDict`|dict): Mask cost config. 64 | dice_cost (obj:`mmcv.ConfigDict`|dict): Dice cost config. 65 | """ 66 | 67 | def __init__( 68 | self, 69 | cls_cost=dict(type="ClassificationCost", weight=1.0), 70 | dice_cost=dict(type="DiceCost", weight=1.0), 71 | mask_cost=dict(type="MaskFocalCost", weight=1.0), 72 | ): 73 | self.cls_cost = build_match_cost(cls_cost) 74 | self.dice_cost = build_match_cost(dice_cost) 75 | self.mask_cost = build_match_cost(mask_cost) 76 | 77 | def assign(self, cls_pred, mask_pred, gt_labels, gt_masks, img_meta, gt_masks_ignore=None, eps=1e-7): 78 | """Computes one-to-one matching based on the weighted costs. 79 | 80 | This method assign each query prediction to a ground truth or 81 | background. The `assigned_gt_inds` with -1 means don't care, 82 | 0 means negative sample, and positive number is the index (1-based) 83 | of assigned gt. 84 | The assignment is done in the following steps, the order matters. 85 | 86 | 1. assign every prediction to -1 87 | 2. compute the weighted costs 88 | 3. do Hungarian matching on CPU based on the costs 89 | 4. assign all to 0 (background) first, then for each matched pair 90 | between predictions and gts, treat this prediction as foreground 91 | and assign the corresponding gt index (plus 1) to it. 92 | 93 | Args: 94 | mask_pred (Tensor): Predicted mask, shape [num_query, h, w] 95 | cls_pred (Tensor): Predicted classification logits, shape 96 | [num_query, num_class]. 97 | gt_masks (Tensor): Ground truth mask, shape [num_gt, h, w]. 98 | gt_labels (Tensor): Label of `gt_masks`, shape (num_gt,). 99 | img_meta (dict): Meta information for current image. 100 | gt_masks_ignore (Tensor, optional): Ground truth masks that are 101 | labelled as `ignored`. Default None. 102 | eps (int | float, optional): A value added to the denominator for 103 | numerical stability. Default 1e-7. 104 | 105 | Returns: 106 | :obj:`AssignResult`: The assigned result. 107 | """ 108 | assert gt_masks_ignore is None, "Only case when gt_masks_ignore is None is supported." 109 | num_gts, num_queries = gt_labels.shape[0], cls_pred.shape[0] 110 | 111 | # 1. assign -1 by default 112 | assigned_gt_inds = cls_pred.new_full((num_queries,), -1, dtype=torch.long) 113 | assigned_labels = cls_pred.new_full((num_queries,), -1, dtype=torch.long) 114 | if num_gts == 0 or num_queries == 0: 115 | # No ground truth or boxes, return empty assignment 116 | if num_gts == 0: 117 | # No ground truth, assign all to background 118 | assigned_gt_inds[:] = 0 119 | return AssignResult(num_gts, assigned_gt_inds, labels=assigned_labels) 120 | 121 | # 2. compute the weighted costs 122 | # classification and maskcost. 
123 | if self.cls_cost.weight != 0 and cls_pred is not None: 124 | cls_cost = self.cls_cost(cls_pred, gt_labels) 125 | else: 126 | cls_cost = 0 127 | 128 | if self.mask_cost.weight != 0: 129 | # mask_pred shape = [nq, h, w] 130 | # gt_mask shape = [ng, h, w] 131 | # mask_cost shape = [nq, ng] 132 | mask_cost = self.mask_cost(mask_pred, gt_masks) 133 | else: 134 | mask_cost = 0 135 | 136 | if self.dice_cost.weight != 0: 137 | dice_cost = self.dice_cost(mask_pred, gt_masks) 138 | else: 139 | dice_cost = 0 140 | cost = cls_cost + mask_cost + dice_cost 141 | 142 | # 3. do Hungarian matching on CPU using linear_sum_assignment 143 | cost = cost.detach().cpu() 144 | if linear_sum_assignment is None: 145 | raise ImportError('Please run "pip install scipy" ' "to install scipy first.") 146 | 147 | matched_row_inds, matched_col_inds = linear_sum_assignment(cost) 148 | matched_row_inds = torch.from_numpy(matched_row_inds).to(cls_pred.device) 149 | matched_col_inds = torch.from_numpy(matched_col_inds).to(cls_pred.device) 150 | 151 | # 4. assign backgrounds and foregrounds 152 | # assign all indices to backgrounds first 153 | assigned_gt_inds[:] = 0 154 | # assign foregrounds based on matching results 155 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 156 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] 157 | return AssignResult(num_gts, assigned_gt_inds, labels=assigned_labels) 158 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/utils/point_sample.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | from mmcv.ops import point_sample 8 | 9 | 10 | def get_uncertainty(mask_pred, labels): 11 | """Estimate uncertainty based on pred logits. 12 | 13 | We estimate uncertainty as L1 distance between 0.0 and the logits 14 | prediction in 'mask_pred' for the foreground class in `classes`. 15 | 16 | Args: 17 | mask_pred (Tensor): mask predication logits, shape (num_rois, 18 | num_classes, mask_height, mask_width). 19 | 20 | labels (list[Tensor]): Either predicted or ground truth label for 21 | each predicted mask, of length num_rois. 22 | 23 | Returns: 24 | scores (Tensor): Uncertainty scores with the most uncertain 25 | locations having the highest uncertainty score, 26 | shape (num_rois, 1, mask_height, mask_width) 27 | """ 28 | if mask_pred.shape[1] == 1: 29 | gt_class_logits = mask_pred.clone() 30 | else: 31 | inds = torch.arange(mask_pred.shape[0], device=mask_pred.device) 32 | gt_class_logits = mask_pred[inds, labels].unsqueeze(1) 33 | return -torch.abs(gt_class_logits) 34 | 35 | 36 | def get_uncertain_point_coords_with_randomness( 37 | mask_pred, labels, num_points, oversample_ratio, importance_sample_ratio 38 | ): 39 | """Get ``num_points`` most uncertain points with random points during 40 | train. 41 | 42 | Sample points in [0, 1] x [0, 1] coordinate space based on their 43 | uncertainty. The uncertainties are calculated for each point using 44 | 'get_uncertainty()' function that takes point's logit prediction as 45 | input. 46 | 47 | Args: 48 | mask_pred (Tensor): A tensor of shape (num_rois, num_classes, 49 | mask_height, mask_width) for class-specific or class-agnostic 50 | prediction. 
51 | labels (list): The ground truth class for each instance. 52 | num_points (int): The number of points to sample. 53 | oversample_ratio (int): Oversampling parameter. 54 | importance_sample_ratio (float): Ratio of points that are sampled 55 | via importance sampling. 56 | 57 | Returns: 58 | point_coords (Tensor): A tensor of shape (num_rois, num_points, 2) 59 | that contains the coordinates of the sampled points. 60 | """ 61 | assert oversample_ratio >= 1 62 | assert 0 <= importance_sample_ratio <= 1 63 | batch_size = mask_pred.shape[0] 64 | num_sampled = int(num_points * oversample_ratio) 65 | point_coords = torch.rand(batch_size, num_sampled, 2, device=mask_pred.device) 66 | point_logits = point_sample(mask_pred, point_coords) 67 | # It is crucial to calculate uncertainty based on the sampled 68 | # prediction value for the points. Calculating uncertainties of the 69 | # coarse predictions first and sampling them for points leads to 70 | # incorrect results. To illustrate this: assume uncertainty func( 71 | # logits)=-abs(logits), a sampled point between two coarse 72 | # predictions with -1 and 1 logits has 0 logits, and therefore 0 73 | # uncertainty value. However, if we calculate uncertainties for the 74 | # coarse predictions first, both will have -1 uncertainty, 75 | # and the sampled point will get -1 uncertainty. 76 | point_uncertainties = get_uncertainty(point_logits, labels) 77 | num_uncertain_points = int(importance_sample_ratio * num_points) 78 | num_random_points = num_points - num_uncertain_points 79 | idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] 80 | shift = num_sampled * torch.arange(batch_size, dtype=torch.long, device=mask_pred.device) 81 | idx += shift[:, None] 82 | point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(batch_size, num_uncertain_points, 2) 83 | if num_random_points > 0: 84 | rand_roi_coords = torch.rand(batch_size, num_random_points, 2, device=mask_pred.device) 85 | point_coords = torch.cat((point_coords, rand_roi_coords), dim=1) 86 | return point_coords 87 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/models/utils/positional_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import math 7 | 8 | import torch 9 | import torch.nn as nn 10 | from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING 11 | from mmcv.runner import BaseModule 12 | 13 | 14 | @POSITIONAL_ENCODING.register_module() 15 | class SinePositionalEncoding(BaseModule): 16 | """Position encoding with sine and cosine functions. 17 | 18 | See `End-to-End Object Detection with Transformers 19 | <https://arxiv.org/abs/2005.12872>`_ for details. 20 | 21 | Args: 22 | num_feats (int): The feature dimension for each position 23 | along x-axis or y-axis. Note the final returned dimension 24 | for each position is 2 times of this value. 25 | temperature (int, optional): The temperature used for scaling 26 | the position embedding. Defaults to 10000. 27 | normalize (bool, optional): Whether to normalize the position 28 | embedding. Defaults to False. 29 | scale (float, optional): A scale factor that scales the position 30 | embedding. The scale will be used only when `normalize` is True. 31 | Defaults to 2*pi.
32 | eps (float, optional): A value added to the denominator for 33 | numerical stability. Defaults to 1e-6. 34 | offset (float): An offset added to the embedding when performing 35 | normalization. Defaults to 0. 36 | init_cfg (dict or list[dict], optional): Initialization config dict. 37 | Default: None 38 | """ 39 | 40 | def __init__( 41 | self, num_feats, temperature=10000, normalize=False, scale=2 * math.pi, eps=1e-6, offset=0.0, init_cfg=None 42 | ): 43 | super(SinePositionalEncoding, self).__init__(init_cfg) 44 | if normalize: 45 | assert isinstance(scale, (float, int)), ( 46 | f"when normalize is set, scale should be provided and in float or int type, found {type(scale)}" 47 | ) 48 | self.num_feats = num_feats 49 | self.temperature = temperature 50 | self.normalize = normalize 51 | self.scale = scale 52 | self.eps = eps 53 | self.offset = offset 54 | 55 | def forward(self, mask): 56 | """Forward function for `SinePositionalEncoding`. 57 | 58 | Args: 59 | mask (Tensor): ByteTensor mask. Non-zero values represent 60 | ignored positions, while zero values mean valid positions 61 | for this image. Shape [bs, h, w]. 62 | 63 | Returns: 64 | pos (Tensor): Returned position embedding with shape 65 | [bs, num_feats*2, h, w]. 66 | """ 67 | # For convenience of exporting to ONNX, it's required to convert 68 | # `mask` from bool to int. 69 | mask = mask.to(torch.int) 70 | not_mask = 1 - mask # logical_not 71 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 72 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 73 | if self.normalize: 74 | y_embed = (y_embed + self.offset) / (y_embed[:, -1:, :] + self.eps) * self.scale 75 | x_embed = (x_embed + self.offset) / (x_embed[:, :, -1:] + self.eps) * self.scale 76 | dim_t = torch.arange(self.num_feats, dtype=torch.float32, device=mask.device) 77 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats) 78 | pos_x = x_embed[:, :, :, None] / dim_t 79 | pos_y = y_embed[:, :, :, None] / dim_t 80 | # use `view` instead of `flatten` for dynamically exporting to ONNX 81 | B, H, W = mask.size() 82 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).view(B, H, W, -1) 83 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).view(B, H, W, -1) 84 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 85 | return pos 86 | 87 | def __repr__(self): 88 | """str: a string that describes the module""" 89 | repr_str = self.__class__.__name__ 90 | repr_str += f"(num_feats={self.num_feats}, " 91 | repr_str += f"temperature={self.temperature}, " 92 | repr_str += f"normalize={self.normalize}, " 93 | repr_str += f"scale={self.scale}, " 94 | repr_str += f"eps={self.eps})" 95 | return repr_str 96 | 97 | 98 | @POSITIONAL_ENCODING.register_module() 99 | class LearnedPositionalEncoding(BaseModule): 100 | """Position embedding with learnable embedding weights. 101 | 102 | Args: 103 | num_feats (int): The feature dimension for each position 104 | along x-axis or y-axis. The final returned dimension for 105 | each position is 2 times this value. 106 | row_num_embed (int, optional): The dictionary size of row embeddings. 107 | Default 50. 108 | col_num_embed (int, optional): The dictionary size of col embeddings. 109 | Default 50. 110 | init_cfg (dict or list[dict], optional): Initialization config dict.
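        Example (an illustrative sketch, not part of the original docstring;
            the shapes simply follow the Args above):
            >>> import torch
            >>> pos_enc = LearnedPositionalEncoding(num_feats=128)
            >>> mask = torch.zeros(2, 32, 32)
            >>> pos_enc(mask).shape
            torch.Size([2, 256, 32, 32])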
111 | """ 112 | 113 | def __init__(self, num_feats, row_num_embed=50, col_num_embed=50, init_cfg=dict(type="Uniform", layer="Embedding")): 114 | super(LearnedPositionalEncoding, self).__init__(init_cfg) 115 | self.row_embed = nn.Embedding(row_num_embed, num_feats) 116 | self.col_embed = nn.Embedding(col_num_embed, num_feats) 117 | self.num_feats = num_feats 118 | self.row_num_embed = row_num_embed 119 | self.col_num_embed = col_num_embed 120 | 121 | def forward(self, mask): 122 | """Forward function for `LearnedPositionalEncoding`. 123 | 124 | Args: 125 | mask (Tensor): ByteTensor mask. Non-zero values represent 126 | ignored positions, while zero values mean valid positions 127 | for this image. Shape [bs, h, w]. 128 | 129 | Returns: 130 | pos (Tensor): Returned position embedding with shape 131 | [bs, num_feats*2, h, w]. 132 | """ 133 | h, w = mask.shape[-2:] 134 | x = torch.arange(w, device=mask.device) 135 | y = torch.arange(h, device=mask.device) 136 | x_embed = self.col_embed(x) 137 | y_embed = self.row_embed(y) 138 | pos = ( 139 | torch.cat((x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat(1, w, 1)), dim=-1) 140 | .permute(2, 0, 1) 141 | .unsqueeze(0) 142 | .repeat(mask.shape[0], 1, 1, 1) 143 | ) 144 | return pos 145 | 146 | def __repr__(self): 147 | """str: a string that describes the module""" 148 | repr_str = self.__class__.__name__ 149 | repr_str += f"(num_feats={self.num_feats}, " 150 | repr_str += f"row_num_embed={self.row_num_embed}, " 151 | repr_str += f"col_num_embed={self.col_num_embed})" 152 | return repr_str 153 | -------------------------------------------------------------------------------- /dinov2/eval/segmentation_m2f/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/fundamentalvision/Deformable-DETR/tree/main/models/ops/modules 8 | # https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | 10 | from .ms_deform_attn import MSDeformAttn 11 | -------------------------------------------------------------------------------- /dinov2/eval/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
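# Usage sketch (an illustrative assumption, not part of the original file):
# the helpers defined below are typically combined from an evaluation script as
#
#   parser = get_args_parser(description="DINOv2 evaluation")
#   args = parser.parse_args()
#   model, autocast_dtype = setup_and_build_model(args)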
5 | 6 | import argparse 7 | from typing import Any, List, Optional, Tuple 8 | 9 | import torch 10 | import torch.backends.cudnn as cudnn 11 | 12 | from dinov2.models import build_model_from_cfg 13 | from dinov2.utils.config import setup 14 | import dinov2.utils.utils as dinov2_utils 15 | 16 | 17 | def get_args_parser( 18 | description: Optional[str] = None, 19 | parents: Optional[List[argparse.ArgumentParser]] = None, 20 | add_help: bool = True, 21 | ): 22 | parser = argparse.ArgumentParser( 23 | description=description, 24 | parents=parents or [], 25 | add_help=add_help, 26 | ) 27 | parser.add_argument( 28 | "--config-file", 29 | type=str, 30 | help="Model configuration file", 31 | ) 32 | parser.add_argument( 33 | "--pretrained-weights", 34 | type=str, 35 | help="Pretrained model weights", 36 | ) 37 | parser.add_argument( 38 | "--output-dir", 39 | default="", 40 | type=str, 41 | help="Output directory to write results and logs", 42 | ) 43 | parser.add_argument( 44 | "--opts", 45 | help="Extra configuration options", 46 | default=[], 47 | nargs="+", 48 | ) 49 | return parser 50 | 51 | 52 | def get_autocast_dtype(config): 53 | teacher_dtype_str = config.compute_precision.teacher.backbone.mixed_precision.param_dtype 54 | if teacher_dtype_str == "fp16": 55 | return torch.half 56 | elif teacher_dtype_str == "bf16": 57 | return torch.bfloat16 58 | else: 59 | return torch.float 60 | 61 | 62 | def build_model_for_eval(config, pretrained_weights): 63 | model, _ = build_model_from_cfg(config, only_teacher=True) 64 | dinov2_utils.load_pretrained_weights(model, pretrained_weights, "teacher") 65 | model.eval() 66 | model.cuda() 67 | return model 68 | 69 | 70 | def setup_and_build_model(args) -> Tuple[Any, torch.dtype]: 71 | cudnn.benchmark = True 72 | config = setup(args) 73 | model = build_model_for_eval(config, args.pretrained_weights) 74 | autocast_dtype = get_autocast_dtype(config) 75 | return model, autocast_dtype 76 | -------------------------------------------------------------------------------- /dinov2/eval/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
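# Usage sketch (an illustrative assumption, not part of the original file):
# the helpers below typically drive feature extraction for k-NN / linear evals as
#
#   model = ModelWithNormalize(backbone)
#   features, labels = extract_features(model, dataset, batch_size=256, num_workers=8)
#
# where `backbone` and `dataset` are hypothetical stand-ins for a DINOv2
# backbone and an ImageNet-style dataset.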
5 | 6 | import logging 7 | from typing import Dict, Optional 8 | 9 | import torch 10 | from torch import nn 11 | from torchmetrics import MetricCollection 12 | 13 | from dinov2.data import DatasetWithEnumeratedTargets, SamplerType, make_data_loader 14 | import dinov2.distributed as distributed 15 | from dinov2.logging import MetricLogger 16 | 17 | 18 | logger = logging.getLogger("dinov2") 19 | 20 | 21 | class ModelWithNormalize(torch.nn.Module): 22 | def __init__(self, model): 23 | super().__init__() 24 | self.model = model 25 | 26 | def forward(self, samples): 27 | return nn.functional.normalize(self.model(samples), dim=1, p=2) 28 | 29 | 30 | class ModelWithIntermediateLayers(nn.Module): 31 | def __init__(self, feature_model, n_last_blocks, autocast_ctx): 32 | super().__init__() 33 | self.feature_model = feature_model 34 | self.feature_model.eval() 35 | self.n_last_blocks = n_last_blocks 36 | self.autocast_ctx = autocast_ctx 37 | 38 | def forward(self, images): 39 | with torch.inference_mode(): 40 | with self.autocast_ctx(): 41 | features = self.feature_model.get_intermediate_layers( 42 | images, self.n_last_blocks, return_class_token=True 43 | ) 44 | return features 45 | 46 | 47 | @torch.inference_mode() 48 | def evaluate( 49 | model: nn.Module, 50 | data_loader, 51 | postprocessors: Dict[str, nn.Module], 52 | metrics: Dict[str, MetricCollection], 53 | device: torch.device, 54 | criterion: Optional[nn.Module] = None, 55 | ): 56 | model.eval() 57 | if criterion is not None: 58 | criterion.eval() 59 | 60 | for metric in metrics.values(): 61 | metric = metric.to(device) 62 | 63 | metric_logger = MetricLogger(delimiter=" ") 64 | header = "Test:" 65 | 66 | for samples, targets, *_ in metric_logger.log_every(data_loader, 10, header): 67 | outputs = model(samples.to(device)) 68 | targets = targets.to(device) 69 | 70 | if criterion is not None: 71 | loss = criterion(outputs, targets) 72 | metric_logger.update(loss=loss.item()) 73 | 74 | for k, metric in metrics.items(): 75 | metric_inputs = postprocessors[k](outputs, targets) 76 | metric.update(**metric_inputs) 77 | 78 | metric_logger.synchronize_between_processes() 79 | logger.info(f"Averaged stats: {metric_logger}") 80 | 81 | stats = {k: metric.compute() for k, metric in metrics.items()} 82 | metric_logger_stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 83 | return metric_logger_stats, stats 84 | 85 | 86 | def all_gather_and_flatten(tensor_rank): 87 | tensor_all_ranks = torch.empty( 88 | distributed.get_global_size(), 89 | *tensor_rank.shape, 90 | dtype=tensor_rank.dtype, 91 | device=tensor_rank.device, 92 | ) 93 | tensor_list = list(tensor_all_ranks.unbind(0)) 94 | torch.distributed.all_gather(tensor_list, tensor_rank.contiguous()) 95 | return tensor_all_ranks.flatten(end_dim=1) 96 | 97 | 98 | def extract_features(model, dataset, batch_size, num_workers, gather_on_cpu=False): 99 | dataset_with_enumerated_targets = DatasetWithEnumeratedTargets(dataset) 100 | sample_count = len(dataset_with_enumerated_targets) 101 | data_loader = make_data_loader( 102 | dataset=dataset_with_enumerated_targets, 103 | batch_size=batch_size, 104 | num_workers=num_workers, 105 | sampler_type=SamplerType.DISTRIBUTED, 106 | drop_last=False, 107 | shuffle=False, 108 | ) 109 | return extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu) 110 | 111 | 112 | @torch.inference_mode() 113 | def extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu=False): 114 | gather_device = 
torch.device("cpu") if gather_on_cpu else torch.device("cuda") 115 | metric_logger = MetricLogger(delimiter=" ") 116 | features, all_labels = None, None 117 | for samples, (index, labels_rank) in metric_logger.log_every(data_loader, 10): 118 | samples = samples.cuda(non_blocking=True) 119 | labels_rank = labels_rank.cuda(non_blocking=True) 120 | index = index.cuda(non_blocking=True) 121 | features_rank = model(samples).float() 122 | 123 | # init storage feature matrix 124 | if features is None: 125 | features = torch.zeros(sample_count, features_rank.shape[-1], device=gather_device) 126 | labels_shape = list(labels_rank.shape) 127 | labels_shape[0] = sample_count 128 | all_labels = torch.full(labels_shape, fill_value=-1, device=gather_device) 129 | logger.info(f"Storing features into tensor of shape {features.shape}") 130 | 131 | # share indexes, features and labels between processes 132 | index_all = all_gather_and_flatten(index).to(gather_device) 133 | features_all_ranks = all_gather_and_flatten(features_rank).to(gather_device) 134 | labels_all_ranks = all_gather_and_flatten(labels_rank).to(gather_device) 135 | 136 | # update storage feature matrix 137 | if len(index_all) > 0: 138 | features.index_copy_(0, index_all, features_all_ranks) 139 | all_labels.index_copy_(0, index_all, labels_all_ranks) 140 | 141 | logger.info(f"Features shape: {tuple(features.shape)}") 142 | logger.info(f"Labels shape: {tuple(all_labels.shape)}") 143 | 144 | assert torch.all(all_labels > -1) 145 | 146 | return features, all_labels 147 | -------------------------------------------------------------------------------- /dinov2/fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
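# Usage sketch (an illustrative assumption, not part of the original file):
# get_fsdp_wrapper below turns a model config into a partially-applied FSDP
# constructor, e.g.
#
#   wrapper = get_fsdp_wrapper(cfg.compute_precision.teacher.backbone, modules_to_wrap={BlockChunk})
#   sharded_model = wrapper(model)
#
# where `cfg`, `model` and `BlockChunk` are hypothetical names; the config is
# expected to carry the mixed_precision and sharding_strategy fields read below.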
5 | 6 | import os 7 | from typing import Any 8 | 9 | import torch 10 | import dinov2.distributed as distributed 11 | from functools import partial 12 | from fvcore.common.checkpoint import Checkpointer 13 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP 14 | from torch.distributed.fsdp import ShardingStrategy 15 | from torch.distributed.fsdp import MixedPrecision 16 | from torch.distributed.fsdp import StateDictType 17 | from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler 18 | from torch.distributed.fsdp.wrap import ModuleWrapPolicy 19 | from torch.distributed.fsdp._runtime_utils import _reshard 20 | 21 | 22 | def get_fsdp_wrapper(model_cfg, modules_to_wrap=set()): 23 | sharding_strategy_dict = { 24 | "NO_SHARD": ShardingStrategy.NO_SHARD, 25 | "SHARD_GRAD_OP": ShardingStrategy.SHARD_GRAD_OP, 26 | "FULL_SHARD": ShardingStrategy.FULL_SHARD, 27 | } 28 | 29 | dtype_dict = { 30 | "fp32": torch.float32, 31 | "fp16": torch.float16, 32 | "bf16": torch.bfloat16, 33 | } 34 | 35 | mixed_precision_config = MixedPrecision( 36 | param_dtype=dtype_dict[model_cfg.mixed_precision.param_dtype], 37 | reduce_dtype=dtype_dict[model_cfg.mixed_precision.reduce_dtype], 38 | buffer_dtype=dtype_dict[model_cfg.mixed_precision.buffer_dtype], 39 | ) 40 | 41 | sharding_strategy_config = sharding_strategy_dict[model_cfg.sharding_strategy] 42 | 43 | local_rank = distributed.get_local_rank() 44 | 45 | fsdp_wrapper = partial( 46 | FSDP, 47 | sharding_strategy=sharding_strategy_config, 48 | mixed_precision=mixed_precision_config, 49 | device_id=local_rank, 50 | sync_module_states=True, 51 | use_orig_params=True, 52 | auto_wrap_policy=ModuleWrapPolicy(modules_to_wrap), 53 | ) 54 | return fsdp_wrapper 55 | 56 | 57 | def is_fsdp(x): 58 | return isinstance(x, FSDP) 59 | 60 | 61 | def is_sharded_fsdp(x): 62 | return is_fsdp(x) and x.sharding_strategy is not ShardingStrategy.NO_SHARD 63 | 64 | 65 | def free_if_fsdp(x): 66 | if is_sharded_fsdp(x): 67 | handles = x._handles 68 | true_list = [True for h in handles] 69 | _reshard(x, handles, true_list) 70 | 71 | 72 | def get_fsdp_modules(x): 73 | return FSDP.fsdp_modules(x) 74 | 75 | 76 | def reshard_fsdp_model(x): 77 | for m in get_fsdp_modules(x): 78 | free_if_fsdp(m) 79 | 80 | 81 | def rankstr(): 82 | return f"rank_{distributed.get_global_rank()}" 83 | 84 | 85 | class FSDPCheckpointer(Checkpointer): 86 | def save(self, name: str, **kwargs: Any) -> None: 87 | """ 88 | Dump model and checkpointables to a file. 89 | 90 | Args: 91 | name (str): name of the file. 92 | kwargs (dict): extra arbitrary data to save. 
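        Example (an illustrative sketch, not part of the original docstring;
            the model, optimizer and iteration value are hypothetical):
                checkpointer = FSDPCheckpointer(model, save_dir="output", optimizer=optimizer)
                checkpointer.save("model_0012499", iteration=12499)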
93 | """ 94 | if not self.save_dir or not self.save_to_disk: 95 | return 96 | 97 | data = {} 98 | with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): 99 | data["model"] = self.model.state_dict() 100 | 101 | # data["model"] = self.model.state_dict() 102 | for key, obj in self.checkpointables.items(): 103 | data[key] = obj.state_dict() 104 | data.update(kwargs) 105 | 106 | basename = f"{name}.{rankstr()}.pth" 107 | save_file = os.path.join(self.save_dir, basename) 108 | assert os.path.basename(save_file) == basename, basename 109 | self.logger.info("Saving checkpoint to {}".format(save_file)) 110 | with self.path_manager.open(save_file, "wb") as f: 111 | torch.save(data, f) 112 | self.tag_last_checkpoint(basename) 113 | 114 | def load(self, *args, **kwargs): 115 | with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): 116 | return super().load(*args, **kwargs) 117 | 118 | def has_checkpoint(self) -> bool: 119 | """ 120 | Returns: 121 | bool: whether a checkpoint exists in the target directory. 122 | """ 123 | save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") 124 | return self.path_manager.exists(save_file) 125 | 126 | def get_checkpoint_file(self) -> str: 127 | """ 128 | Returns: 129 | str: The latest checkpoint file in target directory. 130 | """ 131 | save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") 132 | try: 133 | with self.path_manager.open(save_file, "r") as f: 134 | last_saved = f.read().strip() 135 | except IOError: 136 | # if file doesn't exist, maybe because it has just been 137 | # deleted by a separate process 138 | return "" 139 | # pyre-fixme[6]: For 2nd param expected `Union[PathLike[str], str]` but got 140 | # `Union[bytes, str]`. 141 | return os.path.join(self.save_dir, last_saved) 142 | 143 | def tag_last_checkpoint(self, last_filename_basename: str) -> None: 144 | """ 145 | Tag the last checkpoint. 146 | 147 | Args: 148 | last_filename_basename (str): the basename of the last filename. 149 | """ 150 | if distributed.is_enabled(): 151 | torch.distributed.barrier() 152 | save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") 153 | with self.path_manager.open(save_file, "w") as f: 154 | f.write(last_filename_basename) # pyre-ignore 155 | 156 | 157 | ShardedGradScaler = ShardedGradScaler 158 | -------------------------------------------------------------------------------- /dinov2/hub/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/hub/backbones.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
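# The factory below backs the torch.hub entry points; a typical call (shown
# here as an illustration) is
#
#   model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")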
5 | 6 | from enum import Enum 7 | from typing import Union 8 | 9 | import torch 10 | 11 | from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name 12 | 13 | 14 | class Weights(Enum): 15 | LVD142M = "LVD142M" 16 | 17 | 18 | def _make_dinov2_model( 19 | *, 20 | arch_name: str = "vit_large", 21 | img_size: int = 518, 22 | patch_size: int = 14, 23 | init_values: float = 1.0, 24 | ffn_layer: str = "mlp", 25 | block_chunks: int = 0, 26 | num_register_tokens: int = 0, 27 | interpolate_antialias: bool = False, 28 | interpolate_offset: float = 0.1, 29 | pretrained: bool = True, 30 | weights: Union[Weights, str] = Weights.LVD142M, 31 | **kwargs, 32 | ): 33 | from ..models import vision_transformer as vits 34 | 35 | if isinstance(weights, str): 36 | try: 37 | weights = Weights[weights] 38 | except KeyError: 39 | raise AssertionError(f"Unsupported weights: {weights}") 40 | 41 | model_base_name = _make_dinov2_model_name(arch_name, patch_size) 42 | vit_kwargs = dict( 43 | img_size=img_size, 44 | patch_size=patch_size, 45 | init_values=init_values, 46 | ffn_layer=ffn_layer, 47 | block_chunks=block_chunks, 48 | num_register_tokens=num_register_tokens, 49 | interpolate_antialias=interpolate_antialias, 50 | interpolate_offset=interpolate_offset, 51 | ) 52 | vit_kwargs.update(**kwargs) 53 | model = vits.__dict__[arch_name](**vit_kwargs) 54 | 55 | if pretrained: 56 | model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens) 57 | url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth" 58 | state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") 59 | model.load_state_dict(state_dict, strict=True) 60 | 61 | return model 62 | 63 | 64 | def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 65 | """ 66 | DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset. 67 | """ 68 | return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs) 69 | 70 | 71 | def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 72 | """ 73 | DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset. 74 | """ 75 | return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs) 76 | 77 | 78 | def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 79 | """ 80 | DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset. 81 | """ 82 | return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs) 83 | 84 | 85 | def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 86 | """ 87 | DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset. 88 | """ 89 | return _make_dinov2_model( 90 | arch_name="vit_giant2", 91 | ffn_layer="swiglufused", 92 | weights=weights, 93 | pretrained=pretrained, 94 | **kwargs, 95 | ) 96 | 97 | 98 | def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 99 | """ 100 | DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset. 
101 | """ 102 | return _make_dinov2_model( 103 | arch_name="vit_small", 104 | pretrained=pretrained, 105 | weights=weights, 106 | num_register_tokens=4, 107 | interpolate_antialias=True, 108 | interpolate_offset=0.0, 109 | **kwargs, 110 | ) 111 | 112 | 113 | def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 114 | """ 115 | DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset. 116 | """ 117 | return _make_dinov2_model( 118 | arch_name="vit_base", 119 | pretrained=pretrained, 120 | weights=weights, 121 | num_register_tokens=4, 122 | interpolate_antialias=True, 123 | interpolate_offset=0.0, 124 | **kwargs, 125 | ) 126 | 127 | 128 | def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 129 | """ 130 | DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset. 131 | """ 132 | return _make_dinov2_model( 133 | arch_name="vit_large", 134 | pretrained=pretrained, 135 | weights=weights, 136 | num_register_tokens=4, 137 | interpolate_antialias=True, 138 | interpolate_offset=0.0, 139 | **kwargs, 140 | ) 141 | 142 | 143 | def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): 144 | """ 145 | DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset. 146 | """ 147 | return _make_dinov2_model( 148 | arch_name="vit_giant2", 149 | ffn_layer="swiglufused", 150 | weights=weights, 151 | pretrained=pretrained, 152 | num_register_tokens=4, 153 | interpolate_antialias=True, 154 | interpolate_offset=0.0, 155 | **kwargs, 156 | ) 157 | -------------------------------------------------------------------------------- /dinov2/hub/depth/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .decode_heads import BNHead, DPTHead 7 | from .encoder_decoder import DepthEncoderDecoder 8 | -------------------------------------------------------------------------------- /dinov2/hub/depth/ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
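# Usage sketch (an illustrative assumption, not part of the original file):
# `resize` below is a thin wrapper around F.interpolate that can warn about
# align_corners misuse, e.g.
#
#   out = resize(feat, size=(224, 224), mode="bilinear", align_corners=False)
#
# where `feat` is a hypothetical NCHW feature tensor.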
5 | 6 | import warnings 7 | 8 | import torch.nn.functional as F 9 | 10 | 11 | def resize(input, size=None, scale_factor=None, mode="nearest", align_corners=None, warning=False): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > input_w: 17 | if ( 18 | (output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1) 19 | and (output_h - 1) % (input_h - 1) 20 | and (output_w - 1) % (input_w - 1) 21 | ): 22 | warnings.warn( 23 | f"When align_corners={align_corners}, " 24 | "the output would be more aligned if " 25 | f"input size {(input_h, input_w)} is `x+1` and " 26 | f"out size {(output_h, output_w)} is `nx+1`" 27 | ) 28 | return F.interpolate(input, size, scale_factor, mode, align_corners) 29 | -------------------------------------------------------------------------------- /dinov2/hub/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import itertools 7 | import math 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | 14 | _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" 15 | 16 | 17 | def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: 18 | compact_arch_name = arch_name.replace("_", "")[:4] 19 | registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" 20 | return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" 21 | 22 | 23 | class CenterPadding(nn.Module): 24 | def __init__(self, multiple): 25 | super().__init__() 26 | self.multiple = multiple 27 | 28 | def _get_pad(self, size): 29 | new_size = math.ceil(size / self.multiple) * self.multiple 30 | pad_size = new_size - size 31 | pad_size_left = pad_size // 2 32 | pad_size_right = pad_size - pad_size_left 33 | return pad_size_left, pad_size_right 34 | 35 | @torch.inference_mode() 36 | def forward(self, x): 37 | pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1])) 38 | output = F.pad(x, pads) 39 | return output 40 | -------------------------------------------------------------------------------- /dinov2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .dino_head import DINOHead 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /dinov2/layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
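# Shape sketch (an illustrative assumption, not part of the original file):
#
#   attn = Attention(dim=384, num_heads=6, qkv_bias=True)
#   y = attn(torch.randn(2, 197, 384))  # output keeps the (B, N, C) shape
#
# MemEffAttention below is a drop-in variant that routes through xFormers'
# memory_efficient_attention when it is available.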
5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 9 | 10 | import logging 11 | import os 12 | import warnings 13 | 14 | from torch import Tensor 15 | from torch import nn 16 | 17 | 18 | logger = logging.getLogger("dinov2") 19 | 20 | 21 | XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None 22 | try: 23 | if XFORMERS_ENABLED: 24 | from xformers.ops import memory_efficient_attention, unbind 25 | 26 | XFORMERS_AVAILABLE = True 27 | warnings.warn("xFormers is available (Attention)") 28 | else: 29 | warnings.warn("xFormers is disabled (Attention)") 30 | raise ImportError 31 | except ImportError: 32 | XFORMERS_AVAILABLE = False 33 | warnings.warn("xFormers is not available (Attention)") 34 | 35 | 36 | class Attention(nn.Module): 37 | def __init__( 38 | self, 39 | dim: int, 40 | num_heads: int = 8, 41 | qkv_bias: bool = False, 42 | proj_bias: bool = True, 43 | attn_drop: float = 0.0, 44 | proj_drop: float = 0.0, 45 | ) -> None: 46 | super().__init__() 47 | self.num_heads = num_heads 48 | head_dim = dim // num_heads 49 | self.scale = head_dim**-0.5 50 | 51 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 52 | self.attn_drop = nn.Dropout(attn_drop) 53 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 54 | self.proj_drop = nn.Dropout(proj_drop) 55 | 56 | def forward(self, x: Tensor) -> Tensor: 57 | B, N, C = x.shape 58 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 59 | 60 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 61 | attn = q @ k.transpose(-2, -1) 62 | 63 | attn = attn.softmax(dim=-1) 64 | attn = self.attn_drop(attn) 65 | 66 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 67 | x = self.proj(x) 68 | x = self.proj_drop(x) 69 | return x 70 | 71 | 72 | class MemEffAttention(Attention): 73 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 74 | if not XFORMERS_AVAILABLE: 75 | if attn_bias is not None: 76 | raise AssertionError("xFormers is required for using nested tensors") 77 | return super().forward(x) 78 | 79 | B, N, C = x.shape 80 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 81 | 82 | q, k, v = unbind(qkv, 2) 83 | 84 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 85 | x = x.reshape([B, N, C]) 86 | 87 | x = self.proj(x) 88 | x = self.proj_drop(x) 89 | return x 90 | -------------------------------------------------------------------------------- /dinov2/layers/dino_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
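# Usage sketch (an illustrative assumption, not part of the original file):
# the head maps backbone embeddings to prototype logits, e.g.
#
#   head = DINOHead(in_dim=384, out_dim=65536)
#   logits = head(torch.randn(8, 384))  # -> shape (8, 65536)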
5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn.init import trunc_normal_ 9 | from torch.nn.utils import weight_norm 10 | 11 | 12 | class DINOHead(nn.Module): 13 | def __init__( 14 | self, 15 | in_dim, 16 | out_dim, 17 | use_bn=False, 18 | nlayers=3, 19 | hidden_dim=2048, 20 | bottleneck_dim=256, 21 | mlp_bias=True, 22 | ): 23 | super().__init__() 24 | nlayers = max(nlayers, 1) 25 | self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) 26 | self.apply(self._init_weights) 27 | self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) 28 | self.last_layer.weight_g.data.fill_(1) 29 | 30 | def _init_weights(self, m): 31 | if isinstance(m, nn.Linear): 32 | trunc_normal_(m.weight, std=0.02) 33 | if isinstance(m, nn.Linear) and m.bias is not None: 34 | nn.init.constant_(m.bias, 0) 35 | 36 | def forward(self, x): 37 | x = self.mlp(x) 38 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 39 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 40 | x = self.last_layer(x) 41 | return x 42 | 43 | 44 | def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): 45 | if nlayers == 1: 46 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 47 | else: 48 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 49 | if use_bn: 50 | layers.append(nn.BatchNorm1d(hidden_dim)) 51 | layers.append(nn.GELU()) 52 | for _ in range(nlayers - 2): 53 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 54 | if use_bn: 55 | layers.append(nn.BatchNorm1d(hidden_dim)) 56 | layers.append(nn.GELU()) 57 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 58 | return nn.Sequential(*layers) 59 | -------------------------------------------------------------------------------- /dinov2/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 9 | 10 | 11 | from torch import nn 12 | 13 | 14 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 15 | if drop_prob == 0.0 or not training: 16 | return x 17 | keep_prob = 1 - drop_prob 18 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 19 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 20 | if keep_prob > 0.0: 21 | random_tensor.div_(keep_prob) 22 | output = x * random_tensor 23 | return output 24 | 25 | 26 | class DropPath(nn.Module): 27 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 28 | 29 | def __init__(self, drop_prob=None): 30 | super(DropPath, self).__init__() 31 | self.drop_prob = drop_prob 32 | 33 | def forward(self, x): 34 | return drop_path(x, self.drop_prob, self.training) 35 | -------------------------------------------------------------------------------- /dinov2/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
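# Sketch (an illustrative assumption, not part of the original file):
# LayerScale rescales the channel dimension with a learnable gamma, e.g.
#
#   ls = LayerScale(dim=384, init_values=1e-5)
#   y = ls(torch.randn(2, 197, 384))  # each channel multiplied by its gamma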
5 | 6 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 7 | 8 | from typing import Optional, Union 9 | 10 | import torch 11 | from torch import Tensor 12 | from torch import nn 13 | 14 | 15 | class LayerScale(nn.Module): 16 | def __init__( 17 | self, 18 | dim: int, 19 | init_values: Union[float, Tensor] = 1e-5, 20 | inplace: bool = False, 21 | device: Optional[torch.device] = None, 22 | dtype: Optional[torch.dtype] = None, 23 | ) -> None: 24 | super().__init__() 25 | self.inplace = inplace 26 | self.init_values = init_values 27 | self.gamma = nn.Parameter(torch.empty(dim, device=device, dtype=dtype)) 28 | self.reset_parameters() 29 | 30 | def reset_parameters(self): 31 | nn.init.constant_(self.gamma, self.init_values) 32 | 33 | def forward(self, x: Tensor) -> Tensor: 34 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 35 | -------------------------------------------------------------------------------- /dinov2/layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 9 | 10 | 11 | from typing import Callable, Optional 12 | 13 | from torch import Tensor, nn 14 | 15 | 16 | class Mlp(nn.Module): 17 | def __init__( 18 | self, 19 | in_features: int, 20 | hidden_features: Optional[int] = None, 21 | out_features: Optional[int] = None, 22 | act_layer: Callable[..., nn.Module] = nn.GELU, 23 | drop: float = 0.0, 24 | bias: bool = True, 25 | ) -> None: 26 | super().__init__() 27 | out_features = out_features or in_features 28 | hidden_features = hidden_features or in_features 29 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 30 | self.act = act_layer() 31 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 32 | self.drop = nn.Dropout(drop) 33 | 34 | def forward(self, x: Tensor) -> Tensor: 35 | x = self.fc1(x) 36 | x = self.act(x) 37 | x = self.drop(x) 38 | x = self.fc2(x) 39 | x = self.drop(x) 40 | return x 41 | -------------------------------------------------------------------------------- /dinov2/layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 9 | 10 | from typing import Callable, Optional, Tuple, Union 11 | 12 | from torch import Tensor 13 | import torch.nn as nn 14 | 15 | 16 | def make_2tuple(x): 17 | if isinstance(x, tuple): 18 | assert len(x) == 2 19 | return x 20 | 21 | assert isinstance(x, int) 22 | return (x, x) 23 | 24 | 25 | class PatchEmbed(nn.Module): 26 | """ 27 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 28 | 29 | Args: 30 | img_size: Image size. 31 | patch_size: Patch token size. 32 | in_chans: Number of input image channels. 
33 | embed_dim: Number of linear projection output channels. 34 | norm_layer: Normalization layer. 35 | """ 36 | 37 | def __init__( 38 | self, 39 | img_size: Union[int, Tuple[int, int]] = 224, 40 | patch_size: Union[int, Tuple[int, int]] = 16, 41 | in_chans: int = 3, 42 | embed_dim: int = 768, 43 | norm_layer: Optional[Callable] = None, 44 | flatten_embedding: bool = True, 45 | ) -> None: 46 | super().__init__() 47 | 48 | image_HW = make_2tuple(img_size) 49 | patch_HW = make_2tuple(patch_size) 50 | patch_grid_size = ( 51 | image_HW[0] // patch_HW[0], 52 | image_HW[1] // patch_HW[1], 53 | ) 54 | 55 | self.img_size = image_HW 56 | self.patch_size = patch_HW 57 | self.patches_resolution = patch_grid_size 58 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 59 | 60 | self.in_chans = in_chans 61 | self.embed_dim = embed_dim 62 | 63 | self.flatten_embedding = flatten_embedding 64 | 65 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 66 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 67 | 68 | def forward(self, x: Tensor) -> Tensor: 69 | _, _, H, W = x.shape 70 | patch_H, patch_W = self.patch_size 71 | 72 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 73 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 74 | 75 | x = self.proj(x) # B C H W 76 | H, W = x.size(2), x.size(3) 77 | x = x.flatten(2).transpose(1, 2) # B HW C 78 | x = self.norm(x) 79 | if not self.flatten_embedding: 80 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 81 | return x 82 | 83 | def flops(self) -> float: 84 | Ho, Wo = self.patches_resolution 85 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 86 | if self.norm is not None: 87 | flops += Ho * Wo * self.embed_dim 88 | return flops 89 | -------------------------------------------------------------------------------- /dinov2/layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
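# Shape sketch (an illustrative assumption, not part of the original file):
#
#   ffn = SwiGLUFFN(in_features=384, hidden_features=1024)
#   y = ffn(torch.randn(2, 197, 384))  # -> (2, 197, 384)
#
# w12 projects to 2 * hidden_features; the result is split in two, gated with
# SiLU, and projected back to out_features by w3.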
5 | 6 | import os 7 | from typing import Callable, Optional 8 | import warnings 9 | 10 | from torch import Tensor, nn 11 | import torch.nn.functional as F 12 | 13 | 14 | class SwiGLUFFN(nn.Module): 15 | def __init__( 16 | self, 17 | in_features: int, 18 | hidden_features: Optional[int] = None, 19 | out_features: Optional[int] = None, 20 | act_layer: Callable[..., nn.Module] = None, 21 | drop: float = 0.0, 22 | bias: bool = True, 23 | ) -> None: 24 | super().__init__() 25 | out_features = out_features or in_features 26 | hidden_features = hidden_features or in_features 27 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 28 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 29 | 30 | def forward(self, x: Tensor) -> Tensor: 31 | x12 = self.w12(x) 32 | x1, x2 = x12.chunk(2, dim=-1) 33 | hidden = F.silu(x1) * x2 34 | return self.w3(hidden) 35 | 36 | 37 | XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None 38 | try: 39 | if XFORMERS_ENABLED: 40 | from xformers.ops import SwiGLU 41 | 42 | XFORMERS_AVAILABLE = True 43 | warnings.warn("xFormers is available (SwiGLU)") 44 | else: 45 | warnings.warn("xFormers is disabled (SwiGLU)") 46 | raise ImportError 47 | except ImportError: 48 | SwiGLU = SwiGLUFFN 49 | XFORMERS_AVAILABLE = False 50 | 51 | warnings.warn("xFormers is not available (SwiGLU)") 52 | 53 | 54 | class SwiGLUFFNFused(SwiGLU): 55 | def __init__( 56 | self, 57 | in_features: int, 58 | hidden_features: Optional[int] = None, 59 | out_features: Optional[int] = None, 60 | act_layer: Callable[..., nn.Module] = None, 61 | drop: float = 0.0, 62 | bias: bool = True, 63 | ) -> None: 64 | out_features = out_features or in_features 65 | hidden_features = hidden_features or in_features 66 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 67 | super().__init__( 68 | in_features=in_features, 69 | hidden_features=hidden_features, 70 | out_features=out_features, 71 | bias=bias, 72 | ) 73 | -------------------------------------------------------------------------------- /dinov2/logging/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import functools 7 | import logging 8 | import os 9 | import sys 10 | from typing import Optional 11 | 12 | import dinov2.distributed as distributed 13 | from .helpers import MetricLogger, SmoothedValue 14 | 15 | 16 | # So that calling _configure_logger multiple times won't add many handlers 17 | @functools.lru_cache() 18 | def _configure_logger( 19 | name: Optional[str] = None, 20 | *, 21 | level: int = logging.DEBUG, 22 | output: Optional[str] = None, 23 | ): 24 | """ 25 | Configure a logger. 26 | 27 | Adapted from Detectron2. 28 | 29 | Args: 30 | name: The name of the logger to configure. 31 | level: The logging level to use. 32 | output: A file name or a directory to save log. If None, will not save log file. 33 | If ends with ".txt" or ".log", assumed to be a file name. 34 | Otherwise, logs will be saved to `output/log.txt`. 35 | 36 | Returns: 37 | The configured logger. 
38 | """ 39 | 40 | logger = logging.getLogger(name) 41 | logger.setLevel(level) 42 | logger.propagate = False 43 | 44 | # Loosely match Google glog format: 45 | # [IWEF]yyyymmdd hh:mm:ss.uuuuuu threadid file:line] msg 46 | # but use a shorter timestamp and include the logger name: 47 | # [IWEF]yyyymmdd hh:mm:ss logger threadid file:line] msg 48 | fmt_prefix = "%(levelname).1s%(asctime)s %(process)s %(name)s %(filename)s:%(lineno)s] " 49 | fmt_message = "%(message)s" 50 | fmt = fmt_prefix + fmt_message 51 | datefmt = "%Y%m%d %H:%M:%S" 52 | formatter = logging.Formatter(fmt=fmt, datefmt=datefmt) 53 | 54 | # stdout logging for main worker only 55 | if distributed.is_main_process(): 56 | handler = logging.StreamHandler(stream=sys.stdout) 57 | handler.setLevel(logging.DEBUG) 58 | handler.setFormatter(formatter) 59 | logger.addHandler(handler) 60 | 61 | # file logging for all workers 62 | if output: 63 | if os.path.splitext(output)[-1] in (".txt", ".log"): 64 | filename = output 65 | else: 66 | filename = os.path.join(output, "logs", "log.txt") 67 | 68 | if not distributed.is_main_process(): 69 | global_rank = distributed.get_global_rank() 70 | filename = filename + ".rank{}".format(global_rank) 71 | 72 | os.makedirs(os.path.dirname(filename), exist_ok=True) 73 | 74 | handler = logging.StreamHandler(open(filename, "a")) 75 | handler.setLevel(logging.DEBUG) 76 | handler.setFormatter(formatter) 77 | logger.addHandler(handler) 78 | 79 | return logger 80 | 81 | 82 | def setup_logging( 83 | output: Optional[str] = None, 84 | *, 85 | name: Optional[str] = None, 86 | level: int = logging.DEBUG, 87 | capture_warnings: bool = True, 88 | ) -> None: 89 | """ 90 | Setup logging. 91 | 92 | Args: 93 | output: A file name or a directory to save log files. If None, log 94 | files will not be saved. If output ends with ".txt" or ".log", it 95 | is assumed to be a file name. 96 | Otherwise, logs will be saved to `output/log.txt`. 97 | name: The name of the logger to configure, by default the root logger. 98 | level: The logging level to use. 99 | capture_warnings: Whether warnings should be captured as logs. 100 | """ 101 | logging.captureWarnings(capture_warnings) 102 | _configure_logger(name, level=level, output=output) 103 | -------------------------------------------------------------------------------- /dinov2/logging/helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
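# Usage sketch (an illustrative assumption, not part of the original file):
#
#   metric_logger = MetricLogger(delimiter="  ")
#   for batch in metric_logger.log_every(loader, 10, header="Train:"):
#       ...
#       metric_logger.update(loss=loss_value)
#
# where `loader` and `loss_value` are hypothetical.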
5 | 6 | from collections import defaultdict, deque 7 | import datetime 8 | import json 9 | import logging 10 | import time 11 | 12 | import torch 13 | 14 | import dinov2.distributed as distributed 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | class MetricLogger(object): 21 | def __init__(self, delimiter="\t", output_file=None): 22 | self.meters = defaultdict(SmoothedValue) 23 | self.delimiter = delimiter 24 | self.output_file = output_file 25 | 26 | def update(self, **kwargs): 27 | for k, v in kwargs.items(): 28 | if isinstance(v, torch.Tensor): 29 | v = v.item() 30 | assert isinstance(v, (float, int)) 31 | self.meters[k].update(v) 32 | 33 | def __getattr__(self, attr): 34 | if attr in self.meters: 35 | return self.meters[attr] 36 | if attr in self.__dict__: 37 | return self.__dict__[attr] 38 | raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr)) 39 | 40 | def __str__(self): 41 | loss_str = [] 42 | for name, meter in self.meters.items(): 43 | loss_str.append("{}: {}".format(name, str(meter))) 44 | return self.delimiter.join(loss_str) 45 | 46 | def synchronize_between_processes(self): 47 | for meter in self.meters.values(): 48 | meter.synchronize_between_processes() 49 | 50 | def add_meter(self, name, meter): 51 | self.meters[name] = meter 52 | 53 | def dump_in_output_file(self, iteration, iter_time, data_time): 54 | if self.output_file is None or not distributed.is_main_process(): 55 | return 56 | dict_to_dump = dict( 57 | iteration=iteration, 58 | iter_time=iter_time, 59 | data_time=data_time, 60 | ) 61 | dict_to_dump.update({k: v.median for k, v in self.meters.items()}) 62 | with open(self.output_file, "a") as f: 63 | f.write(json.dumps(dict_to_dump) + "\n") 64 | pass 65 | 66 | def log_every(self, iterable, print_freq, header=None, n_iterations=None, start_iteration=0): 67 | i = start_iteration 68 | if not header: 69 | header = "" 70 | start_time = time.time() 71 | end = time.time() 72 | iter_time = SmoothedValue(fmt="{avg:.6f}") 73 | data_time = SmoothedValue(fmt="{avg:.6f}") 74 | 75 | if n_iterations is None: 76 | n_iterations = len(iterable) 77 | 78 | space_fmt = ":" + str(len(str(n_iterations))) + "d" 79 | 80 | log_list = [ 81 | header, 82 | "[{0" + space_fmt + "}/{1}]", 83 | "eta: {eta}", 84 | "{meters}", 85 | "time: {time}", 86 | "data: {data}", 87 | ] 88 | if torch.cuda.is_available(): 89 | log_list += ["max mem: {memory:.0f}"] 90 | 91 | log_msg = self.delimiter.join(log_list) 92 | MB = 1024.0 * 1024.0 93 | for obj in iterable: 94 | data_time.update(time.time() - end) 95 | yield obj 96 | iter_time.update(time.time() - end) 97 | if i % print_freq == 0 or i == n_iterations - 1: 98 | self.dump_in_output_file(iteration=i, iter_time=iter_time.avg, data_time=data_time.avg) 99 | eta_seconds = iter_time.global_avg * (n_iterations - i) 100 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 101 | if torch.cuda.is_available(): 102 | logger.info( 103 | log_msg.format( 104 | i, 105 | n_iterations, 106 | eta=eta_string, 107 | meters=str(self), 108 | time=str(iter_time), 109 | data=str(data_time), 110 | memory=torch.cuda.max_memory_allocated() / MB, 111 | ) 112 | ) 113 | else: 114 | logger.info( 115 | log_msg.format( 116 | i, 117 | n_iterations, 118 | eta=eta_string, 119 | meters=str(self), 120 | time=str(iter_time), 121 | data=str(data_time), 122 | ) 123 | ) 124 | i += 1 125 | end = time.time() 126 | if i >= n_iterations: 127 | break 128 | total_time = time.time() - start_time 129 | total_time_str = 
str(datetime.timedelta(seconds=int(total_time))) 130 | logger.info("{} Total time: {} ({:.6f} s / it)".format(header, total_time_str, total_time / n_iterations)) 131 | 132 | 133 | class SmoothedValue: 134 | """Track a series of values and provide access to smoothed values over a 135 | window or the global series average. 136 | """ 137 | 138 | def __init__(self, window_size=20, fmt=None): 139 | if fmt is None: 140 | fmt = "{median:.4f} ({global_avg:.4f})" 141 | self.deque = deque(maxlen=window_size) 142 | self.total = 0.0 143 | self.count = 0 144 | self.fmt = fmt 145 | 146 | def update(self, value, num=1): 147 | self.deque.append(value) 148 | self.count += num 149 | self.total += value * num 150 | 151 | def synchronize_between_processes(self): 152 | """ 153 | Distributed synchronization of the metric 154 | Warning: does not synchronize the deque! 155 | """ 156 | if not distributed.is_enabled(): 157 | return 158 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") 159 | torch.distributed.barrier() 160 | torch.distributed.all_reduce(t) 161 | t = t.tolist() 162 | self.count = int(t[0]) 163 | self.total = t[1] 164 | 165 | @property 166 | def median(self): 167 | d = torch.tensor(list(self.deque)) 168 | return d.median().item() 169 | 170 | @property 171 | def avg(self): 172 | d = torch.tensor(list(self.deque), dtype=torch.float32) 173 | return d.mean().item() 174 | 175 | @property 176 | def global_avg(self): 177 | return self.total / self.count 178 | 179 | @property 180 | def max(self): 181 | return max(self.deque) 182 | 183 | @property 184 | def value(self): 185 | return self.deque[-1] 186 | 187 | def __str__(self): 188 | return self.fmt.format( 189 | median=self.median, 190 | avg=self.avg, 191 | global_avg=self.global_avg, 192 | max=self.max, 193 | value=self.value, 194 | ) 195 | -------------------------------------------------------------------------------- /dinov2/loss/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .dino_clstoken_loss import DINOLoss 7 | from .ibot_patch_loss import iBOTPatchLoss 8 | from .koleo_loss import KoLeoLoss 9 | -------------------------------------------------------------------------------- /dinov2/loss/dino_clstoken_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
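# Usage sketch (an illustrative assumption, not part of the original file):
#
#   dino_loss = DINOLoss(out_dim=65536)
#   teacher_probs = dino_loss.softmax_center_teacher(teacher_logits, teacher_temp=0.07)
#   loss = dino_loss([student_logits], [teacher_probs])
#   dino_loss.update_center(teacher_logits)
#
# where student_logits / teacher_logits are hypothetical (B, out_dim) tensors
# and 0.07 is just an example temperature.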
5 | 6 | import torch 7 | import torch.distributed as dist 8 | import torch.nn.functional as F 9 | from torch import nn 10 | 11 | 12 | class DINOLoss(nn.Module): 13 | def __init__( 14 | self, 15 | out_dim, 16 | student_temp=0.1, 17 | center_momentum=0.9, 18 | ): 19 | super().__init__() 20 | self.student_temp = student_temp 21 | self.center_momentum = center_momentum 22 | self.register_buffer("center", torch.zeros(1, out_dim)) 23 | self.updated = True 24 | self.reduce_handle = None 25 | self.len_teacher_output = None 26 | self.async_batch_center = None 27 | 28 | @torch.no_grad() 29 | def softmax_center_teacher(self, teacher_output, teacher_temp): 30 | self.apply_center_update() 31 | # teacher centering and sharpening 32 | return F.softmax((teacher_output - self.center) / teacher_temp, dim=-1) 33 | 34 | @torch.no_grad() 35 | def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_iterations=3): 36 | teacher_output = teacher_output.float() 37 | world_size = dist.get_world_size() if dist.is_initialized() else 1 38 | Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper 39 | B = Q.shape[1] * world_size # number of samples to assign 40 | K = Q.shape[0] # how many prototypes 41 | 42 | # make the matrix sum to 1 43 | sum_Q = torch.sum(Q) 44 | if dist.is_initialized(): 45 | dist.all_reduce(sum_Q) 46 | Q /= sum_Q 47 | 48 | for it in range(n_iterations): 49 | # normalize each row: total weight per prototype must be 1/K 50 | sum_of_rows = torch.sum(Q, dim=1, keepdim=True) 51 | if dist.is_initialized(): 52 | dist.all_reduce(sum_of_rows) 53 | Q /= sum_of_rows 54 | Q /= K 55 | 56 | # normalize each column: total weight per sample must be 1/B 57 | Q /= torch.sum(Q, dim=0, keepdim=True) 58 | Q /= B 59 | 60 | Q *= B # the columns must sum to 1 so that Q is an assignment 61 | return Q.t() 62 | 63 | def forward(self, student_output_list, teacher_out_softmaxed_centered_list): 64 | """ 65 | Cross-entropy between softmax outputs of the teacher and student networks. 66 | """ 67 | # TODO: Use cross_entropy_distribution here 68 | total_loss = 0 69 | for s in student_output_list: 70 | lsm = F.log_softmax(s / self.student_temp, dim=-1) 71 | for t in teacher_out_softmaxed_centered_list: 72 | loss = torch.sum(t * lsm, dim=-1) 73 | total_loss -= loss.mean() 74 | return total_loss 75 | 76 | @torch.no_grad() 77 | def update_center(self, teacher_output): 78 | self.reduce_center_update(teacher_output) 79 | 80 | @torch.no_grad() 81 | def reduce_center_update(self, teacher_output): 82 | self.updated = False 83 | self.len_teacher_output = len(teacher_output) 84 | self.async_batch_center = torch.sum(teacher_output, dim=0, keepdim=True) 85 | if dist.is_initialized(): 86 | self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) 87 | 88 | @torch.no_grad() 89 | def apply_center_update(self): 90 | if self.updated is False: 91 | world_size = dist.get_world_size() if dist.is_initialized() else 1 92 | 93 | if self.reduce_handle is not None: 94 | self.reduce_handle.wait() 95 | _t = self.async_batch_center / (self.len_teacher_output * world_size) 96 | 97 | self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) 98 | 99 | self.updated = True 100 | -------------------------------------------------------------------------------- /dinov2/loss/ibot_patch_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.distributed as dist 8 | import torch.nn.functional as F 9 | from torch import nn 10 | 11 | import logging 12 | 13 | 14 | logger = logging.getLogger("dinov2") 15 | 16 | 17 | try: 18 | from xformers.ops import cross_entropy 19 | 20 | def lossfunc(t, s, temp): 21 | s = s.float() 22 | t = t.float() 23 | if s.ndim == 2: 24 | return -cross_entropy(s.unsqueeze(0), t.unsqueeze(0), temp, bw_inplace=True).squeeze(0) 25 | elif s.ndim == 3: 26 | return -cross_entropy(s, t, temp, bw_inplace=True) 27 | 28 | except ImportError: 29 | 30 | def lossfunc(t, s, temp): 31 | return torch.sum(t * F.log_softmax(s / temp, dim=-1), dim=-1) 32 | 33 | 34 | class iBOTPatchLoss(nn.Module): 35 | def __init__(self, patch_out_dim, student_temp=0.1, center_momentum=0.9): 36 | super().__init__() 37 | self.student_temp = student_temp 38 | self.center_momentum = center_momentum 39 | self.register_buffer("center", torch.zeros(1, 1, patch_out_dim)) 40 | self.updated = True 41 | self.reduce_handle = None 42 | self.len_teacher_patch_tokens = None 43 | self.async_batch_center = None 44 | 45 | @torch.no_grad() 46 | def softmax_center_teacher(self, teacher_patch_tokens, teacher_temp): 47 | self.apply_center_update() 48 | # teacher centering and sharpening 49 | # 50 | # WARNING: 51 | # as self.center is a float32, everything gets casted to float32 afterwards 52 | # 53 | # teacher_patch_tokens = teacher_patch_tokens.float() 54 | # return F.softmax((teacher_patch_tokens.sub_(self.center.to(teacher_patch_tokens.dtype))).mul_(1 / teacher_temp), dim=-1) 55 | 56 | return F.softmax((teacher_patch_tokens - self.center) / teacher_temp, dim=-1) 57 | 58 | # this is experimental, keep everything in float16 and let's see what happens: 59 | # return F.softmax((teacher_patch_tokens.sub_(self.center)) / teacher_temp, dim=-1) 60 | 61 | @torch.no_grad() 62 | def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_masked_patches_tensor, n_iterations=3): 63 | teacher_output = teacher_output.float() 64 | # world_size = dist.get_world_size() if dist.is_initialized() else 1 65 | Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper 66 | # B = Q.shape[1] * world_size # number of samples to assign 67 | B = n_masked_patches_tensor 68 | dist.all_reduce(B) 69 | K = Q.shape[0] # how many prototypes 70 | 71 | # make the matrix sum to 1 72 | sum_Q = torch.sum(Q) 73 | if dist.is_initialized(): 74 | dist.all_reduce(sum_Q) 75 | Q /= sum_Q 76 | 77 | for it in range(n_iterations): 78 | # normalize each row: total weight per prototype must be 1/K 79 | sum_of_rows = torch.sum(Q, dim=1, keepdim=True) 80 | if dist.is_initialized(): 81 | dist.all_reduce(sum_of_rows) 82 | Q /= sum_of_rows 83 | Q /= K 84 | 85 | # normalize each column: total weight per sample must be 1/B 86 | Q /= torch.sum(Q, dim=0, keepdim=True) 87 | Q /= B 88 | 89 | Q *= B # the columns must sum to 1 so that Q is an assignment 90 | return Q.t() 91 | 92 | def forward(self, student_patch_tokens, teacher_patch_tokens, student_masks_flat): 93 | """ 94 | Cross-entropy between softmax outputs of the teacher and student networks.
95 | student_patch_tokens: (B, N, D) tensor 96 | teacher_patch_tokens: (B, N, D) tensor 97 | student_masks_flat: (B, N) tensor 98 | """ 99 | t = teacher_patch_tokens 100 | s = student_patch_tokens 101 | loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) 102 | loss = torch.sum(loss * student_masks_flat.float(), dim=-1) / student_masks_flat.sum(dim=-1).clamp(min=1.0) 103 | return -loss.mean() 104 | 105 | def forward_masked( 106 | self, 107 | student_patch_tokens_masked, 108 | teacher_patch_tokens_masked, 109 | student_masks_flat, 110 | n_masked_patches=None, 111 | masks_weight=None, 112 | ): 113 | t = teacher_patch_tokens_masked 114 | s = student_patch_tokens_masked 115 | # loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) 116 | loss = lossfunc(t, s, self.student_temp) 117 | if masks_weight is None: 118 | masks_weight = ( 119 | (1 / student_masks_flat.sum(-1).clamp(min=1.0)) 120 | .unsqueeze(-1) 121 | .expand_as(student_masks_flat)[student_masks_flat] 122 | ) 123 | if n_masked_patches is not None: 124 | loss = loss[:n_masked_patches] 125 | loss = loss * masks_weight 126 | return -loss.sum() / student_masks_flat.shape[0] 127 | 128 | @torch.no_grad() 129 | def update_center(self, teacher_patch_tokens): 130 | self.reduce_center_update(teacher_patch_tokens) 131 | 132 | @torch.no_grad() 133 | def reduce_center_update(self, teacher_patch_tokens): 134 | self.updated = False 135 | self.len_teacher_patch_tokens = len(teacher_patch_tokens) 136 | self.async_batch_center = torch.sum(teacher_patch_tokens.mean(1), dim=0, keepdim=True) 137 | if dist.is_initialized(): 138 | self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) 139 | 140 | @torch.no_grad() 141 | def apply_center_update(self): 142 | if self.updated is False: 143 | world_size = dist.get_world_size() if dist.is_initialized() else 1 144 | 145 | if self.reduce_handle is not None: 146 | self.reduce_handle.wait() 147 | _t = self.async_batch_center / (self.len_teacher_patch_tokens * world_size) 148 | 149 | self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) 150 | 151 | self.updated = True 152 | -------------------------------------------------------------------------------- /dinov2/loss/koleo_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | # import torch.distributed as dist 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class KoLeoLoss(nn.Module): 19 | """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search""" 20 | 21 | def __init__(self): 22 | super().__init__() 23 | self.pdist = nn.PairwiseDistance(2, eps=1e-8) 24 | 25 | def pairwise_NNs_inner(self, x): 26 | """ 27 | Pairwise nearest neighbors for L2-normalized vectors. 28 | Uses Torch rather than Faiss to remain on GPU. 
29 | """ 30 | # parwise dot products (= inverse distance) 31 | dots = torch.mm(x, x.t()) 32 | n = x.shape[0] 33 | dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1 34 | # max inner prod -> min distance 35 | _, I = torch.max(dots, dim=1) # noqa: E741 36 | return I 37 | 38 | def forward(self, student_output, eps=1e-8): 39 | """ 40 | Args: 41 | student_output (BxD): backbone output of student 42 | """ 43 | with torch.cuda.amp.autocast(enabled=False): 44 | student_output = F.normalize(student_output, eps=eps, p=2, dim=-1) 45 | I = self.pairwise_NNs_inner(student_output) # noqa: E741 46 | distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B 47 | loss = -torch.log(distances + eps).mean() 48 | return loss 49 | -------------------------------------------------------------------------------- /dinov2/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | 8 | from . import vision_transformer as vits 9 | 10 | 11 | logger = logging.getLogger("dinov2") 12 | 13 | 14 | def build_model(args, only_teacher=False, img_size=224): 15 | args.arch = args.arch.removesuffix("_memeff") 16 | if "vit" in args.arch: 17 | vit_kwargs = dict( 18 | img_size=img_size, 19 | patch_size=args.patch_size, 20 | init_values=args.layerscale, 21 | ffn_layer=args.ffn_layer, 22 | block_chunks=args.block_chunks, 23 | qkv_bias=args.qkv_bias, 24 | proj_bias=args.proj_bias, 25 | ffn_bias=args.ffn_bias, 26 | num_register_tokens=args.num_register_tokens, 27 | interpolate_offset=args.interpolate_offset, 28 | interpolate_antialias=args.interpolate_antialias, 29 | ) 30 | teacher = vits.__dict__[args.arch](**vit_kwargs) 31 | if only_teacher: 32 | return teacher, teacher.embed_dim 33 | student = vits.__dict__[args.arch]( 34 | **vit_kwargs, 35 | drop_path_rate=args.drop_path_rate, 36 | drop_path_uniform=args.drop_path_uniform, 37 | ) 38 | embed_dim = student.embed_dim 39 | return student, teacher, embed_dim 40 | 41 | 42 | def build_model_from_cfg(cfg, only_teacher=False): 43 | return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) 44 | -------------------------------------------------------------------------------- /dinov2/run/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/run/eval/knn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
5 | 6 | import logging 7 | import os 8 | import sys 9 | 10 | from dinov2.eval.knn import get_args_parser as get_knn_args_parser 11 | from dinov2.logging import setup_logging 12 | from dinov2.run.submit import get_args_parser, submit_jobs 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class Evaluator: 19 | def __init__(self, args): 20 | self.args = args 21 | 22 | def __call__(self): 23 | from dinov2.eval.knn import main as knn_main 24 | 25 | self._setup_args() 26 | knn_main(self.args) 27 | 28 | def checkpoint(self): 29 | import submitit 30 | 31 | logger.info(f"Requeuing {self.args}") 32 | empty = type(self)(self.args) 33 | return submitit.helpers.DelayedSubmission(empty) 34 | 35 | def _setup_args(self): 36 | import submitit 37 | 38 | job_env = submitit.JobEnvironment() 39 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 40 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 41 | logger.info(f"Args: {self.args}") 42 | 43 | 44 | def main(): 45 | description = "Submitit launcher for DINOv2 k-NN evaluation" 46 | knn_args_parser = get_knn_args_parser(add_help=False) 47 | parents = [knn_args_parser] 48 | args_parser = get_args_parser(description=description, parents=parents) 49 | args = args_parser.parse_args() 50 | 51 | setup_logging() 52 | 53 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 54 | submit_jobs(Evaluator, args, name="dinov2:knn") 55 | return 0 56 | 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main()) 60 | -------------------------------------------------------------------------------- /dinov2/run/eval/linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | import os 8 | import sys 9 | 10 | from dinov2.eval.linear import get_args_parser as get_linear_args_parser 11 | from dinov2.logging import setup_logging 12 | from dinov2.run.submit import get_args_parser, submit_jobs 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class Evaluator: 19 | def __init__(self, args): 20 | self.args = args 21 | 22 | def __call__(self): 23 | from dinov2.eval.linear import main as linear_main 24 | 25 | self._setup_args() 26 | linear_main(self.args) 27 | 28 | def checkpoint(self): 29 | import submitit 30 | 31 | logger.info(f"Requeuing {self.args}") 32 | empty = type(self)(self.args) 33 | return submitit.helpers.DelayedSubmission(empty) 34 | 35 | def _setup_args(self): 36 | import submitit 37 | 38 | job_env = submitit.JobEnvironment() 39 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 40 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 41 | logger.info(f"Args: {self.args}") 42 | 43 | 44 | def main(): 45 | description = "Submitit launcher for DINOv2 linear evaluation" 46 | linear_args_parser = get_linear_args_parser(add_help=False) 47 | parents = [linear_args_parser] 48 | args_parser = get_args_parser(description=description, parents=parents) 49 | args = args_parser.parse_args() 50 | 51 | setup_logging() 52 | 53 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 
54 | submit_jobs(Evaluator, args, name="dinov2:linear") 55 | return 0 56 | 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main()) 60 | -------------------------------------------------------------------------------- /dinov2/run/eval/log_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | import os 8 | import sys 9 | 10 | from dinov2.eval.log_regression import get_args_parser as get_log_regression_args_parser 11 | from dinov2.logging import setup_logging 12 | from dinov2.run.submit import get_args_parser, submit_jobs 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class Evaluator: 19 | def __init__(self, args): 20 | self.args = args 21 | 22 | def __call__(self): 23 | from dinov2.eval.log_regression import main as log_regression_main 24 | 25 | self._setup_args() 26 | log_regression_main(self.args) 27 | 28 | def checkpoint(self): 29 | import submitit 30 | 31 | logger.info(f"Requeuing {self.args}") 32 | empty = type(self)(self.args) 33 | return submitit.helpers.DelayedSubmission(empty) 34 | 35 | def _setup_args(self): 36 | import submitit 37 | 38 | job_env = submitit.JobEnvironment() 39 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 40 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 41 | logger.info(f"Args: {self.args}") 42 | 43 | 44 | def main(): 45 | description = "Submitit launcher for DINOv2 logistic regression evaluation" 46 | log_regression_args_parser = get_log_regression_args_parser(add_help=False) 47 | parents = [log_regression_args_parser] 48 | args_parser = get_args_parser(description=description, parents=parents) 49 | args = args_parser.parse_args() 50 | 51 | setup_logging() 52 | 53 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 54 | submit_jobs(Evaluator, args, name="dinov2:logreg") 55 | return 0 56 | 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main()) 60 | -------------------------------------------------------------------------------- /dinov2/run/submit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
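# This module centralizes the SLURM plumbing shared by the launchers above. A
# sketch of how a new launcher would compose it (MyTask and
# get_my_task_args_parser are hypothetical; the pattern mirrors
# dinov2/run/eval/*.py):
#
#   parents = [get_my_task_args_parser(add_help=False)]
#   args_parser = get_args_parser(description="My launcher", parents=parents)
#   args = args_parser.parse_args()
#   submit_jobs(MyTask, args, name="dinov2:mytask")
#
# Task classes must be picklable and callable; an optional checkpoint() method
# lets submitit requeue them on timeout or preemption.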
5 | 6 | import argparse 7 | import logging 8 | import os 9 | from pathlib import Path 10 | from typing import List, Optional 11 | 12 | import submitit 13 | 14 | from dinov2.utils.cluster import ( 15 | get_slurm_executor_parameters, 16 | get_slurm_partition, 17 | get_user_checkpoint_path, 18 | ) 19 | 20 | 21 | logger = logging.getLogger("dinov2") 22 | 23 | 24 | def get_args_parser( 25 | description: Optional[str] = None, 26 | parents: Optional[List[argparse.ArgumentParser]] = None, 27 | add_help: bool = True, 28 | ) -> argparse.ArgumentParser: 29 | parents = parents or [] 30 | slurm_partition = get_slurm_partition() 31 | parser = argparse.ArgumentParser( 32 | description=description, 33 | parents=parents, 34 | add_help=add_help, 35 | ) 36 | parser.add_argument( 37 | "--ngpus", 38 | "--gpus", 39 | "--gpus-per-node", 40 | default=8, 41 | type=int, 42 | help="Number of GPUs to request on each node", 43 | ) 44 | parser.add_argument( 45 | "--nodes", 46 | "--nnodes", 47 | default=1, 48 | type=int, 49 | help="Number of nodes to request", 50 | ) 51 | parser.add_argument( 52 | "--timeout", 53 | default=2800, 54 | type=int, 55 | help="Duration of the job, in minutes", 56 | ) 57 | parser.add_argument( 58 | "--partition", 59 | default=slurm_partition, 60 | type=str, 61 | help="Partition where to submit", 62 | ) 63 | parser.add_argument( 64 | "--use-volta32", 65 | action="store_true", 66 | help="Request V100-32GB GPUs", 67 | ) 68 | parser.add_argument( 69 | "--comment", 70 | default="", 71 | type=str, 72 | help="Comment to pass to scheduler, e.g. priority message", 73 | ) 74 | parser.add_argument( 75 | "--exclude", 76 | default="", 77 | type=str, 78 | help="Nodes to exclude", 79 | ) 80 | return parser 81 | 82 | 83 | def get_shared_folder() -> Path: 84 | user_checkpoint_path = get_user_checkpoint_path() 85 | if user_checkpoint_path is None: 86 | raise RuntimeError("Path to user checkpoint cannot be determined") 87 | path = user_checkpoint_path / "experiments" 88 | path.mkdir(exist_ok=True) 89 | return path 90 | 91 | 92 | def submit_jobs(task_class, args, name: str): 93 | if not args.output_dir: 94 | args.output_dir = str(get_shared_folder() / "%j") 95 | 96 | Path(args.output_dir).mkdir(parents=True, exist_ok=True) 97 | executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30) 98 | 99 | kwargs = {} 100 | if args.use_volta32: 101 | kwargs["slurm_constraint"] = "volta32gb" 102 | if args.comment: 103 | kwargs["slurm_comment"] = args.comment 104 | if args.exclude: 105 | kwargs["slurm_exclude"] = args.exclude 106 | 107 | executor_params = get_slurm_executor_parameters( 108 | nodes=args.nodes, 109 | num_gpus_per_node=args.ngpus, 110 | timeout_min=args.timeout, # max is 60 * 72 111 | slurm_signal_delay_s=120, 112 | slurm_partition=args.partition, 113 | **kwargs, 114 | ) 115 | executor.update_parameters(name=name, **executor_params) 116 | 117 | task = task_class(args) 118 | job = executor.submit(task) 119 | 120 | logger.info(f"Submitted job_id: {job.job_id}") 121 | str_output_dir = os.path.abspath(args.output_dir).replace("%j", str(job.job_id)) 122 | logger.info(f"Logs and checkpoints will be saved at: {str_output_dir}") 123 | -------------------------------------------------------------------------------- /dinov2/run/train/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | import os 8 | import sys 9 | 10 | from dinov2.logging import setup_logging 11 | from dinov2.train import get_args_parser as get_train_args_parser 12 | from dinov2.run.submit import get_args_parser, submit_jobs 13 | 14 | 15 | logger = logging.getLogger("dinov2") 16 | 17 | 18 | class Trainer(object): 19 | def __init__(self, args): 20 | self.args = args 21 | 22 | def __call__(self): 23 | from dinov2.train import main as train_main 24 | 25 | self._setup_args() 26 | train_main(self.args) 27 | 28 | def checkpoint(self): 29 | import submitit 30 | 31 | logger.info(f"Requeuing {self.args}") 32 | empty = type(self)(self.args) 33 | return submitit.helpers.DelayedSubmission(empty) 34 | 35 | def _setup_args(self): 36 | import submitit 37 | 38 | job_env = submitit.JobEnvironment() 39 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 40 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 41 | logger.info(f"Args: {self.args}") 42 | 43 | 44 | def main(): 45 | description = "Submitit launcher for DINOv2 training" 46 | train_args_parser = get_train_args_parser(add_help=False) 47 | parents = [train_args_parser] 48 | args_parser = get_args_parser(description=description, parents=parents) 49 | args = args_parser.parse_args() 50 | 51 | setup_logging() 52 | 53 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 54 | submit_jobs(Trainer, args, name="dinov2:train") 55 | return 0 56 | 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main()) 60 | -------------------------------------------------------------------------------- /dinov2/train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .train import get_args_parser, main 7 | from .ssl_meta_arch import SSLMetaArch 8 | -------------------------------------------------------------------------------- /dinov2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /dinov2/utils/cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
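# Illustrative behavior of get_slurm_executor_parameters below, derived from
# the defaults in this module and assuming the cluster is detected as FAIR:
#
#   get_slurm_executor_parameters(nodes=2, num_gpus_per_node=8)
#   # -> {"mem_gb": 0, "gpus_per_node": 8, "tasks_per_node": 8,
#   #     "cpus_per_task": 10, "nodes": 2, "slurm_partition": "learnlab"}
#
# On AWS the "mem_gb" key is dropped and cpus_per_task becomes 12; on RSC only
# cpus_per_task changes.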
5 | 6 | from enum import Enum 7 | import os 8 | from pathlib import Path 9 | from typing import Any, Dict, Optional 10 | 11 | 12 | class ClusterType(Enum): 13 | AWS = "aws" 14 | FAIR = "fair" 15 | RSC = "rsc" 16 | 17 | 18 | def _guess_cluster_type() -> ClusterType: 19 | uname = os.uname() 20 | if uname.sysname == "Linux": 21 | if uname.release.endswith("-aws"): 22 | # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws" 23 | return ClusterType.AWS 24 | elif uname.nodename.startswith("rsc"): 25 | # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc" 26 | return ClusterType.RSC 27 | 28 | return ClusterType.FAIR 29 | 30 | 31 | def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]: 32 | if cluster_type is None: 33 | return _guess_cluster_type() 34 | 35 | return cluster_type 36 | 37 | 38 | def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: 39 | cluster_type = get_cluster_type(cluster_type) 40 | if cluster_type is None: 41 | return None 42 | 43 | CHECKPOINT_DIRNAMES = { 44 | ClusterType.AWS: "checkpoints", 45 | ClusterType.FAIR: "checkpoint", 46 | ClusterType.RSC: "checkpoint/dino", 47 | } 48 | return Path("/") / CHECKPOINT_DIRNAMES[cluster_type] 49 | 50 | 51 | def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: 52 | checkpoint_path = get_checkpoint_path(cluster_type) 53 | if checkpoint_path is None: 54 | return None 55 | 56 | username = os.environ.get("USER") 57 | assert username is not None 58 | return checkpoint_path / username 59 | 60 | 61 | def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]: 62 | cluster_type = get_cluster_type(cluster_type) 63 | if cluster_type is None: 64 | return None 65 | 66 | SLURM_PARTITIONS = { 67 | ClusterType.AWS: "learnlab", 68 | ClusterType.FAIR: "learnlab", 69 | ClusterType.RSC: "learn", 70 | } 71 | return SLURM_PARTITIONS[cluster_type] 72 | 73 | 74 | def get_slurm_executor_parameters( 75 | nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs 76 | ) -> Dict[str, Any]: 77 | # create default parameters 78 | params = { 79 | "mem_gb": 0, # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html 80 | "gpus_per_node": num_gpus_per_node, 81 | "tasks_per_node": num_gpus_per_node, # one task per GPU 82 | "cpus_per_task": 10, 83 | "nodes": nodes, 84 | "slurm_partition": get_slurm_partition(cluster_type), 85 | } 86 | # apply cluster-specific adjustments 87 | cluster_type = get_cluster_type(cluster_type) 88 | if cluster_type == ClusterType.AWS: 89 | params["cpus_per_task"] = 12 90 | del params["mem_gb"] 91 | elif cluster_type == ClusterType.RSC: 92 | params["cpus_per_task"] = 12 93 | # set additional parameters / apply overrides 94 | params.update(kwargs) 95 | return params 96 | -------------------------------------------------------------------------------- /dinov2/utils/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
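# The sqrt_wrt_1024 scaling rule below computes
#
#   lr = base_lr * sqrt(batch_size_per_gpu * world_size / 1024)
#
# With illustrative numbers: base_lr=0.004 and 32 images per GPU on 32 GPUs
# gives a global batch of 1024, so lr stays at 0.004; keeping 32 images per
# GPU but doubling to 64 GPUs scales it by sqrt(2) to roughly 0.0057.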
5 | 6 | import math 7 | import logging 8 | import os 9 | 10 | from omegaconf import OmegaConf 11 | 12 | import dinov2.distributed as distributed 13 | from dinov2.logging import setup_logging 14 | from dinov2.utils import utils 15 | from dinov2.configs import dinov2_default_config 16 | 17 | 18 | logger = logging.getLogger("dinov2") 19 | 20 | 21 | def apply_scaling_rules_to_cfg(cfg): # to fix 22 | if cfg.optim.scaling_rule == "sqrt_wrt_1024": 23 | base_lr = cfg.optim.base_lr 24 | cfg.optim.lr = base_lr 25 | cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0) 26 | logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}") 27 | else: 28 | raise NotImplementedError 29 | return cfg 30 | 31 | 32 | def write_config(cfg, output_dir, name="config.yaml"): 33 | logger.info(OmegaConf.to_yaml(cfg)) 34 | saved_cfg_path = os.path.join(output_dir, name) 35 | with open(saved_cfg_path, "w") as f: 36 | OmegaConf.save(config=cfg, f=f) 37 | return saved_cfg_path 38 | 39 | 40 | def get_cfg_from_args(args): 41 | args.output_dir = os.path.abspath(args.output_dir) 42 | args.opts += [f"train.output_dir={args.output_dir}"] 43 | default_cfg = OmegaConf.create(dinov2_default_config) 44 | cfg = OmegaConf.load(args.config_file) 45 | cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts)) 46 | return cfg 47 | 48 | 49 | def default_setup(args): 50 | distributed.enable(overwrite=True) 51 | seed = getattr(args, "seed", 0) 52 | rank = distributed.get_global_rank() 53 | 54 | global logger 55 | setup_logging(output=args.output_dir, level=logging.INFO) 56 | logger = logging.getLogger("dinov2") 57 | 58 | utils.fix_random_seeds(seed + rank) 59 | logger.info("git:\n {}\n".format(utils.get_sha())) 60 | logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) 61 | 62 | 63 | def setup(args): 64 | """ 65 | Create configs and perform basic setups. 66 | """ 67 | cfg = get_cfg_from_args(args) 68 | os.makedirs(args.output_dir, exist_ok=True) 69 | default_setup(args) 70 | apply_scaling_rules_to_cfg(cfg) 71 | write_config(cfg, args.output_dir) 72 | return cfg 73 | -------------------------------------------------------------------------------- /dinov2/utils/dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
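# Usage sketch for as_torch_dtype below; all three spellings resolve to the
# same torch dtype:
#
#   as_torch_dtype("float32")           # -> torch.float32
#   as_torch_dtype(np.dtype("int64"))   # -> torch.int64
#   as_torch_dtype(torch.float16)       # -> torch.float16 (passed through)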
5 | 6 | 7 | from typing import Dict, Union 8 | 9 | import numpy as np 10 | import torch 11 | 12 | 13 | TypeSpec = Union[str, np.dtype, torch.dtype] 14 | 15 | 16 | _NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = { 17 | np.dtype("bool"): torch.bool, 18 | np.dtype("uint8"): torch.uint8, 19 | np.dtype("int8"): torch.int8, 20 | np.dtype("int16"): torch.int16, 21 | np.dtype("int32"): torch.int32, 22 | np.dtype("int64"): torch.int64, 23 | np.dtype("float16"): torch.float16, 24 | np.dtype("float32"): torch.float32, 25 | np.dtype("float64"): torch.float64, 26 | np.dtype("complex64"): torch.complex64, 27 | np.dtype("complex128"): torch.complex128, 28 | } 29 | 30 | 31 | def as_torch_dtype(dtype: TypeSpec) -> torch.dtype: 32 | if isinstance(dtype, torch.dtype): 33 | return dtype 34 | if isinstance(dtype, str): 35 | dtype = np.dtype(dtype) 36 | assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}" 37 | return _NUMPY_TO_TORCH_DTYPE[dtype] 38 | -------------------------------------------------------------------------------- /dinov2/utils/param_groups.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from collections import defaultdict 7 | import logging 8 | 9 | 10 | logger = logging.getLogger("dinov2") 11 | 12 | 13 | def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False): 14 | """ 15 | Calculate lr decay rate for different ViT blocks. 16 | Args: 17 | name (string): parameter name. 18 | lr_decay_rate (float): base lr decay rate. 19 | num_layers (int): number of ViT blocks. 20 | Returns: 21 | lr decay rate for the given parameter. 22 | """ 23 | layer_id = num_layers + 1 24 | if name.startswith("backbone") or force_is_backbone: 25 | if ( 26 | ".pos_embed" in name 27 | or ".patch_embed" in name 28 | or ".mask_token" in name 29 | or ".cls_token" in name 30 | or ".register_tokens" in name 31 | ): 32 | layer_id = 0 33 | elif force_is_backbone and ( 34 | "pos_embed" in name 35 | or "patch_embed" in name 36 | or "mask_token" in name 37 | or "cls_token" in name 38 | or "register_tokens" in name 39 | ): 40 | layer_id = 0 41 | elif ".blocks." in name and ".residual." not in name: 42 | layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 43 | elif chunked_blocks and "blocks." in name and "residual." not in name: 44 | layer_id = int(name[name.find("blocks.") :].split(".")[2]) + 1 45 | elif "blocks." in name and "residual."
not in name: 46 | layer_id = int(name[name.find("blocks.") :].split(".")[1]) + 1 47 | 48 | return lr_decay_rate ** (num_layers + 1 - layer_id) 49 | 50 | 51 | def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0): 52 | chunked_blocks = False 53 | if hasattr(model, "n_blocks"): 54 | logger.info("chunked fsdp") 55 | n_blocks = model.n_blocks 56 | chunked_blocks = model.chunked_blocks 57 | elif hasattr(model, "blocks"): 58 | logger.info("first code branch") 59 | n_blocks = len(model.blocks) 60 | elif hasattr(model, "backbone"): 61 | logger.info("second code branch") 62 | n_blocks = len(model.backbone.blocks) 63 | else: 64 | logger.info("else code branch") 65 | n_blocks = 0 66 | all_param_groups = [] 67 | 68 | for name, param in model.named_parameters(): 69 | name = name.replace("_fsdp_wrapped_module.", "") 70 | if not param.requires_grad: 71 | continue 72 | decay_rate = get_vit_lr_decay_rate( 73 | name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks 74 | ) 75 | d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name} 76 | 77 | if "last_layer" in name: 78 | d.update({"is_last_layer": True}) 79 | 80 | if name.endswith(".bias") or "norm" in name or "gamma" in name: 81 | d.update({"wd_multiplier": 0.0}) 82 | 83 | if "patch_embed" in name: 84 | d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult}) 85 | 86 | all_param_groups.append(d) 87 | logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""") 88 | 89 | return all_param_groups 90 | 91 | 92 | def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")): 93 | fused_params_groups = defaultdict(lambda: {"params": []}) 94 | for d in all_params_groups: 95 | identifier = "" 96 | for k in keys: 97 | identifier += k + str(d[k]) + "_" 98 | 99 | for k in keys: 100 | fused_params_groups[identifier][k] = d[k] 101 | fused_params_groups[identifier]["params"].append(d["params"]) 102 | 103 | return fused_params_groups.values() 104 | -------------------------------------------------------------------------------- /dinov2/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
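# Usage sketch for CosineScheduler below (values are illustrative, not the
# training defaults). The schedule runs freeze -> linear warmup -> cosine
# decay, and indexing past total_iters clamps to final_value:
#
#   sched = CosineScheduler(base_value=2e-3, final_value=1e-6, total_iters=1000, warmup_iters=100)
#   lr = sched[250]        # somewhere on the cosine branch
#   lr_end = sched[10_000] # == 1e-6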
5 | 6 | import logging 7 | import os 8 | import random 9 | import subprocess 10 | from urllib.parse import urlparse 11 | 12 | import numpy as np 13 | import torch 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | def load_pretrained_weights(model, pretrained_weights, checkpoint_key): 21 | if urlparse(pretrained_weights).scheme: # If it looks like a URL 22 | state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu") 23 | else: 24 | state_dict = torch.load(pretrained_weights, map_location="cpu") 25 | if checkpoint_key is not None and checkpoint_key in state_dict: 26 | logger.info(f"Take key {checkpoint_key} in provided checkpoint dict") 27 | state_dict = state_dict[checkpoint_key] 28 | # remove `module.` prefix 29 | state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} 30 | # remove `backbone.` prefix induced by multicrop wrapper 31 | state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()} 32 | msg = model.load_state_dict(state_dict, strict=False) 33 | logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg)) 34 | 35 | 36 | def fix_random_seeds(seed=31): 37 | """ 38 | Fix random seeds. 39 | """ 40 | torch.manual_seed(seed) 41 | torch.cuda.manual_seed_all(seed) 42 | np.random.seed(seed) 43 | random.seed(seed) 44 | 45 | 46 | def get_sha(): 47 | cwd = os.path.dirname(os.path.abspath(__file__)) 48 | 49 | def _run(command): 50 | return subprocess.check_output(command, cwd=cwd).decode("ascii").strip() 51 | 52 | sha = "N/A" 53 | diff = "clean" 54 | branch = "N/A" 55 | try: 56 | sha = _run(["git", "rev-parse", "HEAD"]) 57 | subprocess.check_output(["git", "diff"], cwd=cwd) 58 | diff = _run(["git", "diff-index", "HEAD"]) 59 | diff = "has uncommitted changes" if diff else "clean" 60 | branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]) 61 | except Exception: 62 | pass 63 | message = f"sha: {sha}, status: {diff}, branch: {branch}" 64 | return message 65 | 66 | 67 | class CosineScheduler(object): 68 | def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0): 69 | super().__init__() 70 | self.final_value = final_value 71 | self.total_iters = total_iters 72 | 73 | freeze_schedule = np.zeros((freeze_iters)) 74 | 75 | warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters) 76 | 77 | iters = np.arange(total_iters - warmup_iters - freeze_iters) 78 | schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters))) 79 | self.schedule = np.concatenate((freeze_schedule, warmup_schedule, schedule)) 80 | 81 | assert len(self.schedule) == self.total_iters 82 | 83 | def __getitem__(self, it): 84 | if it >= self.total_iters: 85 | return self.final_value 86 | else: 87 | return self.schedule[it] 88 | 89 | 90 | def has_batchnorms(model): 91 | bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm) 92 | for name, module in model.named_modules(): 93 | if isinstance(module, bn_types): 94 | return True 95 | return False 96 | -------------------------------------------------------------------------------- /hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree.
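# Hedged usage sketch for the hub entry points re-exported below (weights are
# downloaded on first use; the 384-dim output assumes the ViT-S/14 backbone
# with its default identity head):
#
#   import torch
#   backbone = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
#   feats = backbone(torch.randn(1, 3, 224, 224))  # -> (1, 384) CLS features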
5 | 6 | 7 | from dinov2.hub.backbones import dinov2_vitb14, dinov2_vitg14, dinov2_vitl14, dinov2_vits14 8 | from dinov2.hub.backbones import dinov2_vitb14_reg, dinov2_vitg14_reg, dinov2_vitl14_reg, dinov2_vits14_reg 9 | from dinov2.hub.classifiers import dinov2_vitb14_lc, dinov2_vitg14_lc, dinov2_vitl14_lc, dinov2_vits14_lc 10 | from dinov2.hub.classifiers import dinov2_vitb14_reg_lc, dinov2_vitg14_reg_lc, dinov2_vitl14_reg_lc, dinov2_vits14_reg_lc 11 | from dinov2.hub.depthers import dinov2_vitb14_ld, dinov2_vitg14_ld, dinov2_vitl14_ld, dinov2_vits14_ld 12 | from dinov2.hub.depthers import dinov2_vitb14_dd, dinov2_vitg14_dd, dinov2_vitl14_dd, dinov2_vits14_dd 13 | 14 | 15 | dependencies = ["torch"] 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | 4 | [tool.pylint.master] 5 | persistent = false 6 | score = false 7 | 8 | [tool.pylint.messages_control] 9 | disable = "all" 10 | enable = [ 11 | "miscellaneous", 12 | "similarities", 13 | ] 14 | 15 | [tool.pylint.similarities] 16 | ignore-comments = true 17 | ignore-docstrings = true 18 | ignore-imports = true 19 | min-similarity-lines = 8 20 | 21 | [tool.pylint.reports] 22 | reports = false 23 | 24 | [tool.pylint.miscellaneous] 25 | notes = [ 26 | "FIXME", 27 | "XXX", 28 | "TODO", 29 | ] 30 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black==22.6.0 2 | flake8==5.0.4 3 | pylint==2.15.0 4 | -------------------------------------------------------------------------------- /requirements-extras.txt: -------------------------------------------------------------------------------- 1 | mmcv-full==1.5.0 2 | mmsegmentation==0.27.0 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu117 2 | torch==2.0.0 3 | torchvision==0.15.0 4 | omegaconf 5 | torchmetrics==0.10.3 6 | fvcore 7 | iopath 8 | xformers==0.0.18 9 | submitit 10 | --extra-index-url https://pypi.nvidia.com 11 | cuml-cu11 12 | -------------------------------------------------------------------------------- /scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -n "$1" ]; then 4 | echo "linting \"$1\"" 5 | fi 6 | 7 | echo "running black" 8 | if [ -n "$1" ]; then 9 | black "$1" 10 | else 11 | black dinov2 12 | fi 13 | 14 | echo "running flake8" 15 | if [ -n "$1" ]; then 16 | flake8 "$1" 17 | else 18 | flake8 19 | fi 20 | 21 | echo "running pylint" 22 | if [ -n "$1" ]; then 23 | pylint "$1" 24 | else 25 | pylint dinov2 26 | fi 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203,E501,W503 4 | per-file-ignores = 5 | __init__.py:F401 6 | hubconf.py:F401 7 | exclude = 8 | venv 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from pathlib import Path 7 | import re 8 | from typing import List, Tuple 9 | 10 | from setuptools import setup, find_packages 11 | 12 | 13 | NAME = "dinov2" 14 | DESCRIPTION = "PyTorch code and models for the DINOv2 self-supervised learning method." 15 | 16 | URL = "https://github.com/facebookresearch/dinov2" 17 | AUTHOR = "FAIR" 18 | REQUIRES_PYTHON = ">=3.9.0" 19 | HERE = Path(__file__).parent 20 | 21 | 22 | try: 23 | with open(HERE / "README.md", encoding="utf-8") as f: 24 | long_description = "\n" + f.read() 25 | except FileNotFoundError: 26 | long_description = DESCRIPTION 27 | 28 | 29 | def get_requirements(path: str = HERE / "requirements.txt") -> Tuple[List[str], List[str]]: 30 | requirements = [] 31 | extra_indices = [] 32 | with open(path) as f: 33 | for line in f.readlines(): 34 | line = line.rstrip("\r\n") 35 | if line.startswith("--extra-index-url "): 36 | extra_indices.append(line[18:]) 37 | continue 38 | requirements.append(line) 39 | return requirements, extra_indices 40 | 41 | 42 | def get_package_version() -> str: 43 | with open(HERE / "dinov2/__init__.py") as f: 44 | result = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", f.read(), re.M) 45 | if result: 46 | return result.group(1) 47 | raise RuntimeError("Can't get package version") 48 | 49 | 50 | requirements, extra_indices = get_requirements() 51 | version = get_package_version() 52 | dev_requirements, _ = get_requirements(HERE / "requirements-dev.txt") 53 | extras_requirements, _ = get_requirements(HERE / "requirements-extras.txt") 54 | 55 | 56 | setup( 57 | name=NAME, 58 | version=version, 59 | description=DESCRIPTION, 60 | long_description=long_description, 61 | long_description_content_type="text/markdown", 62 | author=AUTHOR, 63 | python_requires=REQUIRES_PYTHON, 64 | url=URL, 65 | packages=find_packages(), 66 | package_data={ 67 | "": ["*.yaml"], 68 | }, 69 | install_requires=requirements, 70 | extras_require={ 71 | "dev": dev_requirements, 72 | "extras": extras_requirements, 73 | }, 74 | dependency_links=extra_indices, 75 | include_package_data=True, 76 | license="Apache", 77 | license_files=("LICENSE",), 78 | classifiers=[ 79 | # Trove classifiers: https://github.com/pypa/trove-classifiers/blob/main/src/trove_classifiers/__init__.py 80 | "Development Status :: 3 - Alpha", 81 | "Intended Audience :: Developers", 82 | "Intended Audience :: Science/Research", 83 | "License :: OSI Approved :: Apache Software License", 84 | "Programming Language :: Python :: 3.9", 85 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 86 | "Topic :: Software Development :: Libraries :: Python Modules", 87 | ], 88 | ) 89 | --------------------------------------------------------------------------------
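A minimal single-process sketch tying together the loss modules from dinov2/loss/ shown earlier (random tensors stand in for head outputs; the dinov2.loss.dino_loss module path, the 0.07 teacher temperature and the 0.1 KoLeo weight are illustrative assumptions, not values read from the training configs):

import torch

from dinov2.loss.dino_loss import DINOLoss
from dinov2.loss.koleo_loss import KoLeoLoss

out_dim, batch, embed_dim = 1024, 8, 384  # illustrative sizes

dino = DINOLoss(out_dim)
koleo = KoLeoLoss()

student_logits = torch.randn(batch, out_dim)  # student DINO head output
teacher_logits = torch.randn(batch, out_dim)  # teacher DINO head output

# Teacher targets: center then sharpen, and queue the (async) center update.
teacher_soft = dino.softmax_center_teacher(teacher_logits, teacher_temp=0.07)
dino.update_center(teacher_logits)

# Cross-entropy term plus the KoLeo spreading regularizer on backbone features.
loss = dino([student_logits], [teacher_soft]) + 0.1 * koleo(torch.randn(batch, embed_dim))
print(float(loss))

The distributed branches in both losses are guarded by dist.is_initialized(), so the sketch runs without a process group; in actual training these pieces are orchestrated by SSLMetaArch rather than called by hand.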