├── LICENSE
├── README.md
├── anyup
│   ├── __init__.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── attention
│   │   │   ├── __init__.py
│   │   │   ├── attention_masking.py
│   │   │   └── chunked_attention.py
│   │   ├── convolutions.py
│   │   ├── feature_unification.py
│   │   └── positional_encoding.py
│   ├── model.py
│   └── utils
│       ├── __init__.py
│       ├── img.py
│       └── visualization
│           ├── __init__.py
│           └── attention_visualization.py
├── example_usage.ipynb
└── hubconf.py
/LICENSE:
--------------------------------------------------------------------------------
1 | Attribution 4.0 International
2 |
3 | =======================================================================
4 |
5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
6 | does not provide legal services or legal advice. Distribution of
7 | Creative Commons public licenses does not create a lawyer-client or
8 | other relationship. Creative Commons makes its licenses and related
9 | information available on an "as-is" basis. Creative Commons gives no
10 | warranties regarding its licenses, any material licensed under their
11 | terms and conditions, or any related information. Creative Commons
12 | disclaims all liability for damages resulting from their use to the
13 | fullest extent possible.
14 |
15 | Using Creative Commons Public Licenses
16 |
17 | Creative Commons public licenses provide a standard set of terms and
18 | conditions that creators and other rights holders may use to share
19 | original works of authorship and other material subject to copyright
20 | and certain other rights specified in the public license below. The
21 | following considerations are for informational purposes only, are not
22 | exhaustive, and do not form part of our licenses.
23 |
24 | Considerations for licensors: Our public licenses are
25 | intended for use by those authorized to give the public
26 | permission to use material in ways otherwise restricted by
27 | copyright and certain other rights. Our licenses are
28 | irrevocable. Licensors should read and understand the terms
29 | and conditions of the license they choose before applying it.
30 | Licensors should also secure all rights necessary before
31 | applying our licenses so that the public can reuse the
32 | material as expected. Licensors should clearly mark any
33 | material not subject to the license. This includes other CC-
34 | licensed material, or material used under an exception or
35 | limitation to copyright. More considerations for licensors:
36 | wiki.creativecommons.org/Considerations_for_licensors
37 |
38 | Considerations for the public: By using one of our public
39 | licenses, a licensor grants the public permission to use the
40 | licensed material under specified terms and conditions. If
41 | the licensor's permission is not necessary for any reason--for
42 | example, because of any applicable exception or limitation to
43 | copyright--then that use is not regulated by the license. Our
44 | licenses grant only permissions under copyright and certain
45 | other rights that a licensor has authority to grant. Use of
46 | the licensed material may still be restricted for other
47 | reasons, including because others have copyright or other
48 | rights in the material. A licensor may make special requests,
49 | such as asking that all changes be marked or described.
50 | Although not required by our licenses, you are encouraged to
51 | respect those requests where reasonable. More considerations
52 | for the public:
53 | wiki.creativecommons.org/Considerations_for_licensees
54 |
55 | =======================================================================
56 |
57 | Creative Commons Attribution 4.0 International Public License
58 |
59 | By exercising the Licensed Rights (defined below), You accept and agree
60 | to be bound by the terms and conditions of this Creative Commons
61 | Attribution 4.0 International Public License ("Public License"). To the
62 | extent this Public License may be interpreted as a contract, You are
63 | granted the Licensed Rights in consideration of Your acceptance of
64 | these terms and conditions, and the Licensor grants You such rights in
65 | consideration of benefits the Licensor receives from making the
66 | Licensed Material available under these terms and conditions.
67 |
68 |
69 | Section 1 -- Definitions.
70 |
71 | a. Adapted Material means material subject to Copyright and Similar
72 | Rights that is derived from or based upon the Licensed Material
73 | and in which the Licensed Material is translated, altered,
74 | arranged, transformed, or otherwise modified in a manner requiring
75 | permission under the Copyright and Similar Rights held by the
76 | Licensor. For purposes of this Public License, where the Licensed
77 | Material is a musical work, performance, or sound recording,
78 | Adapted Material is always produced where the Licensed Material is
79 | synched in timed relation with a moving image.
80 |
81 | b. Adapter's License means the license You apply to Your Copyright
82 | and Similar Rights in Your contributions to Adapted Material in
83 | accordance with the terms and conditions of this Public License.
84 |
85 | c. Copyright and Similar Rights means copyright and/or similar rights
86 | closely related to copyright including, without limitation,
87 | performance, broadcast, sound recording, and Sui Generis Database
88 | Rights, without regard to how the rights are labeled or
89 | categorized. For purposes of this Public License, the rights
90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar
91 | Rights.
92 |
93 | d. Effective Technological Measures means those measures that, in the
94 | absence of proper authority, may not be circumvented under laws
95 | fulfilling obligations under Article 11 of the WIPO Copyright
96 | Treaty adopted on December 20, 1996, and/or similar international
97 | agreements.
98 |
99 | e. Exceptions and Limitations means fair use, fair dealing, and/or
100 | any other exception or limitation to Copyright and Similar Rights
101 | that applies to Your use of the Licensed Material.
102 |
103 | f. Licensed Material means the artistic or literary work, database,
104 | or other material to which the Licensor applied this Public
105 | License.
106 |
107 | g. Licensed Rights means the rights granted to You subject to the
108 | terms and conditions of this Public License, which are limited to
109 | all Copyright and Similar Rights that apply to Your use of the
110 | Licensed Material and that the Licensor has authority to license.
111 |
112 | h. Licensor means the individual(s) or entity(ies) granting rights
113 | under this Public License.
114 |
115 | i. Share means to provide material to the public by any means or
116 | process that requires permission under the Licensed Rights, such
117 | as reproduction, public display, public performance, distribution,
118 | dissemination, communication, or importation, and to make material
119 | available to the public including in ways that members of the
120 | public may access the material from a place and at a time
121 | individually chosen by them.
122 |
123 | j. Sui Generis Database Rights means rights other than copyright
124 | resulting from Directive 96/9/EC of the European Parliament and of
125 | the Council of 11 March 1996 on the legal protection of databases,
126 | as amended and/or succeeded, as well as other essentially
127 | equivalent rights anywhere in the world.
128 |
129 | k. You means the individual or entity exercising the Licensed Rights
130 | under this Public License. Your has a corresponding meaning.
131 |
132 |
133 | Section 2 -- Scope.
134 |
135 | a. License grant.
136 |
137 | 1. Subject to the terms and conditions of this Public License,
138 | the Licensor hereby grants You a worldwide, royalty-free,
139 | non-sublicensable, non-exclusive, irrevocable license to
140 | exercise the Licensed Rights in the Licensed Material to:
141 |
142 | a. reproduce and Share the Licensed Material, in whole or
143 | in part; and
144 |
145 | b. produce, reproduce, and Share Adapted Material.
146 |
147 | 2. Exceptions and Limitations. For the avoidance of doubt, where
148 | Exceptions and Limitations apply to Your use, this Public
149 | License does not apply, and You do not need to comply with
150 | its terms and conditions.
151 |
152 | 3. Term. The term of this Public License is specified in Section
153 | 6(a).
154 |
155 | 4. Media and formats; technical modifications allowed. The
156 | Licensor authorizes You to exercise the Licensed Rights in
157 | all media and formats whether now known or hereafter created,
158 | and to make technical modifications necessary to do so. The
159 | Licensor waives and/or agrees not to assert any right or
160 | authority to forbid You from making technical modifications
161 | necessary to exercise the Licensed Rights, including
162 | technical modifications necessary to circumvent Effective
163 | Technological Measures. For purposes of this Public License,
164 | simply making modifications authorized by this Section 2(a)
165 | (4) never produces Adapted Material.
166 |
167 | 5. Downstream recipients.
168 |
169 | a. Offer from the Licensor -- Licensed Material. Every
170 | recipient of the Licensed Material automatically
171 | receives an offer from the Licensor to exercise the
172 | Licensed Rights under the terms and conditions of this
173 | Public License.
174 |
175 | b. No downstream restrictions. You may not offer or impose
176 | any additional or different terms or conditions on, or
177 | apply any Effective Technological Measures to, the
178 | Licensed Material if doing so restricts exercise of the
179 | Licensed Rights by any recipient of the Licensed
180 | Material.
181 |
182 | 6. No endorsement. Nothing in this Public License constitutes or
183 | may be construed as permission to assert or imply that You
184 | are, or that Your use of the Licensed Material is, connected
185 | with, or sponsored, endorsed, or granted official status by,
186 | the Licensor or others designated to receive attribution as
187 | provided in Section 3(a)(1)(A)(i).
188 |
189 | b. Other rights.
190 |
191 | 1. Moral rights, such as the right of integrity, are not
192 | licensed under this Public License, nor are publicity,
193 | privacy, and/or other similar personality rights; however, to
194 | the extent possible, the Licensor waives and/or agrees not to
195 | assert any such rights held by the Licensor to the limited
196 | extent necessary to allow You to exercise the Licensed
197 | Rights, but not otherwise.
198 |
199 | 2. Patent and trademark rights are not licensed under this
200 | Public License.
201 |
202 | 3. To the extent possible, the Licensor waives any right to
203 | collect royalties from You for the exercise of the Licensed
204 | Rights, whether directly or through a collecting society
205 | under any voluntary or waivable statutory or compulsory
206 | licensing scheme. In all other cases the Licensor expressly
207 | reserves any right to collect such royalties.
208 |
209 |
210 | Section 3 -- License Conditions.
211 |
212 | Your exercise of the Licensed Rights is expressly made subject to the
213 | following conditions.
214 |
215 | a. Attribution.
216 |
217 | 1. If You Share the Licensed Material (including in modified
218 | form), You must:
219 |
220 | a. retain the following if it is supplied by the Licensor
221 | with the Licensed Material:
222 |
223 | i. identification of the creator(s) of the Licensed
224 | Material and any others designated to receive
225 | attribution, in any reasonable manner requested by
226 | the Licensor (including by pseudonym if
227 | designated);
228 |
229 | ii. a copyright notice;
230 |
231 | iii. a notice that refers to this Public License;
232 |
233 | iv. a notice that refers to the disclaimer of
234 | warranties;
235 |
236 | v. a URI or hyperlink to the Licensed Material to the
237 | extent reasonably practicable;
238 |
239 | b. indicate if You modified the Licensed Material and
240 | retain an indication of any previous modifications; and
241 |
242 | c. indicate the Licensed Material is licensed under this
243 | Public License, and include the text of, or the URI or
244 | hyperlink to, this Public License.
245 |
246 | 2. You may satisfy the conditions in Section 3(a)(1) in any
247 | reasonable manner based on the medium, means, and context in
248 | which You Share the Licensed Material. For example, it may be
249 | reasonable to satisfy the conditions by providing a URI or
250 | hyperlink to a resource that includes the required
251 | information.
252 |
253 | 3. If requested by the Licensor, You must remove any of the
254 | information required by Section 3(a)(1)(A) to the extent
255 | reasonably practicable.
256 |
257 | 4. If You Share Adapted Material You produce, the Adapter's
258 | License You apply must not prevent recipients of the Adapted
259 | Material from complying with this Public License.
260 |
261 |
262 | Section 4 -- Sui Generis Database Rights.
263 |
264 | Where the Licensed Rights include Sui Generis Database Rights that
265 | apply to Your use of the Licensed Material:
266 |
267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right
268 | to extract, reuse, reproduce, and Share all or a substantial
269 | portion of the contents of the database;
270 |
271 | b. if You include all or a substantial portion of the database
272 | contents in a database in which You have Sui Generis Database
273 | Rights, then the database in which You have Sui Generis Database
274 | Rights (but not its individual contents) is Adapted Material; and
275 |
276 | c. You must comply with the conditions in Section 3(a) if You Share
277 | all or a substantial portion of the contents of the database.
278 |
279 | For the avoidance of doubt, this Section 4 supplements and does not
280 | replace Your obligations under this Public License where the Licensed
281 | Rights include other Copyright and Similar Rights.
282 |
283 |
284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
285 |
286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
296 |
297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
306 |
307 | c. The disclaimer of warranties and limitation of liability provided
308 | above shall be interpreted in a manner that, to the extent
309 | possible, most closely approximates an absolute disclaimer and
310 | waiver of all liability.
311 |
312 |
313 | Section 6 -- Term and Termination.
314 |
315 | a. This Public License applies for the term of the Copyright and
316 | Similar Rights licensed here. However, if You fail to comply with
317 | this Public License, then Your rights under this Public License
318 | terminate automatically.
319 |
320 | b. Where Your right to use the Licensed Material has terminated under
321 | Section 6(a), it reinstates:
322 |
323 | 1. automatically as of the date the violation is cured, provided
324 | it is cured within 30 days of Your discovery of the
325 | violation; or
326 |
327 | 2. upon express reinstatement by the Licensor.
328 |
329 | For the avoidance of doubt, this Section 6(b) does not affect any
330 | right the Licensor may have to seek remedies for Your violations
331 | of this Public License.
332 |
333 | c. For the avoidance of doubt, the Licensor may also offer the
334 | Licensed Material under separate terms or conditions or stop
335 | distributing the Licensed Material at any time; however, doing so
336 | will not terminate this Public License.
337 |
338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
339 | License.
340 |
341 |
342 | Section 7 -- Other Terms and Conditions.
343 |
344 | a. The Licensor shall not be bound by any additional or different
345 | terms or conditions communicated by You unless expressly agreed.
346 |
347 | b. Any arrangements, understandings, or agreements regarding the
348 | Licensed Material not stated herein are separate from and
349 | independent of the terms and conditions of this Public License.
350 |
351 |
352 | Section 8 -- Interpretation.
353 |
354 | a. For the avoidance of doubt, this Public License does not, and
355 | shall not be interpreted to, reduce, limit, restrict, or impose
356 | conditions on any use of the Licensed Material that could lawfully
357 | be made without permission under this Public License.
358 |
359 | b. To the extent possible, if any provision of this Public License is
360 | deemed unenforceable, it shall be automatically reformed to the
361 | minimum extent necessary to make it enforceable. If the provision
362 | cannot be reformed, it shall be severed from this Public License
363 | without affecting the enforceability of the remaining terms and
364 | conditions.
365 |
366 | c. No term or condition of this Public License will be waived and no
367 | failure to comply consented to unless expressly agreed to by the
368 | Licensor.
369 |
370 | d. Nothing in this Public License constitutes or may be interpreted
371 | as a limitation upon, or waiver of, any privileges and immunities
372 | that apply to the Licensor or You, including from the legal
373 | processes of any jurisdiction or authority.
374 |
375 |
376 | =======================================================================
377 |
378 | Creative Commons is not a party to its public licenses.
379 | Notwithstanding, Creative Commons may elect to apply one of its public
380 | licenses to material it publishes and in those instances will be
381 | considered the “Licensor.” The text of the Creative Commons public
382 | licenses is dedicated to the public domain under the CC0 Public Domain
383 | Dedication. Except for the limited purpose of indicating that material
384 | is shared under a Creative Commons public license or as otherwise
385 | permitted by the Creative Commons policies published at
386 | creativecommons.org/policies, Creative Commons does not authorize the
387 | use of the trademark "Creative Commons" or any other trademark or logo
388 | of Creative Commons without its prior written consent including,
389 | without limitation, in connection with any unauthorized modifications
390 | to any of its public licenses or any other arrangements,
391 | understandings, or agreements concerning use of licensed material. For
392 | the avoidance of doubt, this paragraph does not form part of the public
393 | licenses.
394 |
395 | Creative Commons may be contacted at creativecommons.org.
396 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### AnyUp: Universal Feature Upsampling
2 |
3 | [**Thomas Wimmer**](https://wimmerth.github.io/)<sup>1,2</sup>,
4 | [Prune Truong](https://prunetruong.com/)<sup>3</sup>,
5 | [Marie-Julie Rakotosaona](https://scholar.google.com/citations?user=eQ0om98AAAAJ&hl=en)<sup>3</sup>,
6 | [Michael Oechsle](https://moechsle.github.io/)<sup>3</sup>,
7 | [Federico Tombari](https://federicotombari.github.io/)<sup>3,4</sup>,
8 | [Bernt Schiele](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/people/bernt-schiele)<sup>1</sup>,
9 | [Jan Eric Lenssen](https://janericlenssen.github.io/)<sup>1</sup>
10 |
11 | <sup>1</sup>Max Planck Institute for Informatics, <sup>2</sup>ETH Zurich, <sup>3</sup>Google, <sup>4</sup>TU Munich
12 |
13 | [Project Page](https://wimmerth.github.io/anyup/)
14 | [arXiv](https://arxiv.org/abs/2510.12764)
15 | [Open in Colab](https://colab.research.google.com/github/wimmerth/anyup/blob/main/example_usage.ipynb)
16 |
17 | [Teaser figure](https://wimmerth.github.io/)
18 |
19 | **Abstract:**
20 |
21 | We introduce AnyUp, a method for feature upsampling that can be applied to any vision feature at any resolution, without
22 | encoder-specific training. Existing learning-based upsamplers for features like DINO or CLIP need to be re-trained for
23 | every feature extractor and thus do not generalize to different feature types at inference time. In this work, we
24 | propose an _inference-time_ feature-agnostic upsampling architecture to alleviate this limitation and improve upsampling
25 | quality. In our experiments, AnyUp sets a new state of the art for upsampled features, generalizes to different feature
26 | types, and preserves feature semantics while being efficient and easy to apply to a wide range of downstream tasks.
27 |
28 | ---
29 |
30 | ### Use AnyUp to upsample your features!
31 |
32 | Upsample features from any model, at any layer, without having to retrain the upsampler. It's as easy as this:
33 |
34 | ```python
35 | import torch
36 | # high-resolution image (B, 3, H, W)
37 | hr_image = ...
38 | # low-resolution features (B, C, h, w)
39 | lr_features = ...
40 | # load the AnyUp upsampler model
41 | upsampler = torch.hub.load('wimmerth/anyup', 'anyup')
42 | # upsampled high-resolution features (B, C, H, W)
43 | hr_features = upsampler(hr_image, lr_features)
44 | ```
45 |
46 | **Notes:**
47 | - The `hr_image` should be normalized to ImageNet mean and std, as is standard for most vision encoders (see the preprocessing sketch below).
48 | - The `lr_features` can be any features from any encoder, e.g. DINO, CLIP, or ResNet.
49 |
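For reference, a minimal preprocessing sketch, assuming `torchvision`, `Pillow`, and the DINOv2 ViT-S/14 backbone from `torch.hub` (the released checkpoint was trained on these features); any encoder that yields `(B, C, h, w)` feature maps works:

```python
import torch
import torchvision.transforms as T
from PIL import Image

# ImageNet normalization; "image.jpg" and the backbone choice are placeholders
preprocess = T.Compose([
    T.Resize((448, 448)),  # a multiple of the ViT-S/14 patch size
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
hr_image = preprocess(Image.open("image.jpg")).unsqueeze(0)  # (1, 3, H, W)

# extract low-resolution patch features, e.g. from DINOv2 ViT-S/14
backbone = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14').eval()
with torch.no_grad():
    # reshape=True returns the patch tokens as a (1, C, h, w) feature map
    lr_features = backbone.get_intermediate_layers(hr_image, n=1, reshape=True)[0]

upsampler = torch.hub.load('wimmerth/anyup', 'anyup')
hr_features = upsampler(hr_image, lr_features)  # (1, C, H, W)
```
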
50 | The `hr_features` will have the same spatial resolution as the `hr_image` by default.
51 | If you want a different output resolution, you can specify it with the `output_size` argument:
52 |
53 | ```python
54 | # upsampled features with custom output size (B, C, H', W')
55 | hr_features = upsampler(hr_image, lr_features, output_size=(H_prime, W_prime))
56 | ```
57 |
58 | If you have limited compute resources and run into OOM issues when upsampling to high resolutions, you can use the
59 | `q_chunk_size` argument to trade off speed for memory:
60 |
61 | ```python
62 | # upsampled features using chunking to save memory (B, C, H, W)
63 | hr_features = upsampler(hr_image, lr_features, q_chunk_size=128)
64 | ```
65 |
66 | If you are interested in the attention maps that AnyUp uses to upsample the features, the forward pass includes an
67 | optional visualization:
68 |
69 | ```python
70 | # matplotlib must be installed to use this feature
71 | # upsampled features and display attention map visualization (B, C, H, W)
72 | hr_features = upsampler(hr_image, lr_features, vis_attn=True)
73 | ```
74 |
75 | ---
76 |
77 | **Training code** for AnyUp will be released soon!
78 |
79 | We are also planning to integrate FlexAttention support to speed up the window attention and reduce memory consumption.
80 | We are always happy for a helping hand, so feel free to reach out if you want to contribute!
81 |
82 | ---
83 |
84 | **Evaluation** followed the protocols of [JAFAR](https://github.com/PaulCouairon/JAFAR) for semantic segmentation and
85 | [Probe3D](https://github.com/mbanani/probe3d) for surface normal and depth estimation. Note that we applied a small fix
86 | to the probe training in JAFAR (stepping the LR schedule per epoch instead of per iteration). We therefore re-ran
87 | all experiments, including the baselines, to ensure a fair comparison.
88 |
89 | **Acknowledgements:**
90 | We built our implementation on top of the [JAFAR repository](https://github.com/PaulCouairon/JAFAR) and thank the
91 | authors for open-sourcing their code. Other noteworthy open-source repositories include:
92 | [LoftUp](https://github.com/andrehuang/loftup), [FeatUp](https://github.com/mhamilton723/FeatUp), and
93 | [Probe3D](https://github.com/mbanani/probe3d).
94 |
95 | ---
96 | ### Citation
97 |
98 | If you find our work useful in your research, please cite it as:
99 | ```
100 | @article{wimmer2025anyup,
101 | title={AnyUp: Universal Feature Upsampling},
102 | author={Wimmer, Thomas and Truong, Prune and Rakotosaona, Marie-Julie and Oechsle, Michael and Tombari, Federico and Schiele, Bernt and Lenssen, Jan Eric},
103 | journal={arXiv preprint arXiv:2510.12764},
104 | year={2025}
105 | }
106 | ```
--------------------------------------------------------------------------------
/anyup/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wimmerth/anyup/0dbfc182cb3481abcb090253b86dc5f652f7a537/anyup/__init__.py
--------------------------------------------------------------------------------
/anyup/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .convolutions import ResBlock
2 | from .feature_unification import LearnedFeatureUnification
3 | from .attention import CrossAttentionBlock
4 | from .positional_encoding import RoPE
--------------------------------------------------------------------------------
/anyup/layers/attention/__init__.py:
--------------------------------------------------------------------------------
1 | from .chunked_attention import CrossAttentionBlock
--------------------------------------------------------------------------------
/anyup/layers/attention/attention_masking.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing import Tuple
3 | from functools import lru_cache
4 |
5 |
6 | def window2d(
7 | low_res: int | Tuple[int, int],
8 | high_res: int | Tuple[int, int],
9 | ratio: float,
10 | *,
11 | device: str = "cpu"
12 | ) -> torch.Tensor:
13 | # unpack
14 | if isinstance(high_res, int):
15 | H = W = high_res
16 | else:
17 | H, W = high_res
18 | if isinstance(low_res, int):
19 | Lh = Lw = low_res
20 | else:
21 | Lh, Lw = low_res
22 |
23 | # pixel-centers in [0,1)
24 | r_pos = (torch.arange(H, device=device, dtype=torch.float32) + 0.5) / H # (H,)
25 | c_pos = (torch.arange(W, device=device, dtype=torch.float32) + 0.5) / W # (W,)
26 | pos_r, pos_c = torch.meshgrid(r_pos, c_pos, indexing="ij") # (H,W)
27 |
28 | # clamp before scaling
29 | r_lo = (pos_r - ratio).clamp(0.0, 1.0)
30 | r_hi = (pos_r + ratio).clamp(0.0, 1.0)
31 | c_lo = (pos_c - ratio).clamp(0.0, 1.0)
32 | c_hi = (pos_c + ratio).clamp(0.0, 1.0)
33 |
34 | # quantise symmetrically
35 | r0 = (r_lo * Lh).floor().long() # inclusive start
36 | r1 = (r_hi * Lh).ceil().long() # exclusive end
37 | c0 = (c_lo * Lw).floor().long()
38 | c1 = (c_hi * Lw).ceil().long()
39 |
40 | return torch.stack([r0, r1, c0, c1], dim=2)
41 |
42 |
43 | @lru_cache
44 | def compute_attention_mask(high_res_h, high_res_w, low_res_h, low_res_w, window_size_ratio, device="cpu"):
45 | h, w = high_res_h, high_res_w
46 | h_, w_ = low_res_h, low_res_w
47 |
48 | windows = window2d(
49 | low_res=(h_, w_),
50 | high_res=(h, w),
51 | ratio=window_size_ratio,
52 | device=device
53 | )
54 |
55 | q = h * w # number of high-res query locations
56 |
57 | # flatten window bounds: (q, 1)
58 | r0 = windows[..., 0].reshape(q, 1)
59 | r1 = windows[..., 1].reshape(q, 1) # exclusive
60 | c0 = windows[..., 2].reshape(q, 1)
61 | c1 = windows[..., 3].reshape(q, 1) # exclusive
62 |
63 | # row / column indices on low-res grid
64 | rows = torch.arange(h_, device=device) # (h_,)
65 | cols = torch.arange(w_, device=device) # (w_,)
66 |
67 | row_ok = (rows >= r0) & (rows < r1) # (q, h_)
68 | col_ok = (cols >= c0) & (cols < c1) # (q, w_)
69 |
70 | # broadcast to (q, h_, w_) and flatten last two dims
71 | attention_mask = (row_ok.unsqueeze(2) & col_ok.unsqueeze(1)) \
72 | .reshape(q, h_ * w_).to(dtype=torch.bool)
73 |
74 | return ~attention_mask
75 |
--------------------------------------------------------------------------------
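For orientation, a minimal usage sketch of the masking utility above: the returned boolean mask has one row per high-resolution query and one column per flattened low-resolution key, and `True` marks key positions that are blocked, matching the `attn_mask` convention of `nn.MultiheadAttention`. The grid sizes below are illustrative.

```python
from anyup.layers.attention.attention_masking import compute_attention_mask

# 32x32 query grid attending into a 16x16 key grid; each query only sees keys
# within +/- 10% of the normalized image extent around its own position
mask = compute_attention_mask(32, 32, 16, 16, window_size_ratio=0.1)

print(mask.shape)                         # torch.Size([1024, 256]) -> (queries, keys)
print(mask.dtype)                         # torch.bool; True = blocked key position
print((~mask).sum(dim=1).float().mean())  # average number of visible keys per query
```
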
/anyup/layers/attention/chunked_attention.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch import einsum
4 | from typing import Optional
5 | from .attention_masking import compute_attention_mask
6 |
7 |
8 | class CrossAttention(nn.Module):
9 | def __init__(self, qk_dim, num_heads,
10 | q_chunk_size: Optional[int] = None,
11 | store_attn: bool = False):
12 | super().__init__()
13 | self.norm_q = nn.RMSNorm(qk_dim)
14 | self.norm_k = nn.RMSNorm(qk_dim)
15 | self.q_chunk_size = q_chunk_size
16 | self.store_attn = store_attn
17 | self.attention = nn.MultiheadAttention(
18 | embed_dim=qk_dim,
19 | num_heads=num_heads,
20 | dropout=0.0,
21 | batch_first=True,
22 | )
23 |
24 | @torch.no_grad()
25 | def _slice_mask(self, mask, start, end):
26 | if mask is None:
27 | return None
28 | # 2D: (tgt_len, src_len), 3D: (B*num_heads or B, tgt_len, src_len)
29 | if mask.dim() == 2:
30 | return mask[start:end, :]
31 | elif mask.dim() == 3:
32 | return mask[:, start:end, :]
33 | else:
34 | raise ValueError("attn_mask must be 2D or 3D")
35 |
36 | def forward(self, query, key, value, mask=None,
37 | q_chunk_size: Optional[int] = None,
38 | store_attn: Optional[bool] = None):
39 | q_chunk_size = self.q_chunk_size if q_chunk_size is None else q_chunk_size
40 | store_attn = self.store_attn if store_attn is None else store_attn
41 |
42 | val = key
43 |
44 | query = self.norm_q(query)
45 | key = self.norm_k(key)
46 |
47 | # Fast path: no chunking
48 | if q_chunk_size is None or query.size(1) <= q_chunk_size:
49 | _, attn = self.attention(query, key, val,
50 | average_attn_weights=True,
51 | attn_mask=mask)
52 | features = einsum("b i j, b j d -> b i d", attn, value)
53 | return features, (attn if store_attn else None)
54 |
55 | # Chunked over the query length (tgt_len)
56 | B, Q, _ = query.shape
57 | outputs = []
58 | attns = [] if store_attn else None
59 |
60 | for start in range(0, Q, q_chunk_size):
61 | end = min(start + q_chunk_size, Q)
62 | q_chunk = query[:, start:end, :]
63 | mask_chunk = self._slice_mask(mask, start, end)
64 |
65 | # We ignore the MHA output as in JAFAR:
66 | # use the averaged attention to weight the unprojected V.
67 | _, attn_chunk = self.attention(q_chunk, key, val,
68 | average_attn_weights=True,
69 | attn_mask=mask_chunk)
70 | out_chunk = einsum("b i j, b j d -> b i d", attn_chunk, value)
71 | outputs.append(out_chunk)
72 | if store_attn:
73 | attns.append(attn_chunk)
74 |
75 | features = torch.cat(outputs, dim=1)
76 | attn_scores = torch.cat(attns, dim=1) if store_attn else None
77 | return features, attn_scores
78 |
79 |
80 | class CrossAttentionBlock(nn.Module):
81 | def __init__(self, qk_dim, num_heads, window_ratio: float = 0.1,
82 | q_chunk_size: Optional[int] = None, **kwargs):
83 | super().__init__()
84 | self.cross_attn = CrossAttention(
85 | qk_dim, num_heads,
86 | q_chunk_size=q_chunk_size
87 | )
88 | self.window_ratio = window_ratio
89 | self.conv2d = nn.Conv2d(qk_dim, qk_dim, kernel_size=3, stride=1, padding=1, bias=False)
90 |
91 | def forward(self, q, k, v, q_chunk_size: Optional[int] = None, store_attn: Optional[bool] = None, vis_attn=False,
92 | **kwargs):
93 | store_attn = store_attn or vis_attn
94 | q = self.conv2d(q)
95 | if self.window_ratio > 0:
96 | attn_mask = compute_attention_mask(
97 | *q.shape[-2:], *k.shape[-2:], window_size_ratio=self.window_ratio
98 | ).to(q.device)
99 | else:
100 | attn_mask = None
101 | b, _, h, w = q.shape
102 | _, _, h_k, w_k = k.shape
103 | c = v.shape[1]
104 | q = q.permute(0, 2, 3, 1).view(b, h * w, -1)
105 | k = k.permute(0, 2, 3, 1).view(b, h_k * w_k, -1)
106 | v = v.permute(0, 2, 3, 1).view(b, h_k * w_k, -1)
107 |
108 | features, attn = self.cross_attn(q, k, v, mask=attn_mask,
109 | q_chunk_size=q_chunk_size,
110 | store_attn=store_attn)
111 | features = features.view(b, h, w, c).permute(0, 3, 1, 2)
112 | if vis_attn:
113 | from anyup.utils.visualization import visualize_attention_oklab
114 | import matplotlib.pyplot as plt
115 |
116 | ref, out = visualize_attention_oklab(attn[0], h, w, h_k, w_k)
117 |
118 | fig, ax = plt.subplots(1, 2, figsize=(10, 5))
119 | ax[0].imshow(ref.cpu().numpy())
120 | ax[0].set_title("Reference (Values)")
121 | ax[0].set_xticks([-.5, w_k - .5], labels=[0, w_k])
122 | ax[0].set_yticks([-.5, h_k - .5], labels=[0, h_k])
123 |
124 | ax[1].imshow(out.cpu().numpy())
125 | ax[1].set_title("Attention Output")
126 | ax[1].set_xticks([-.5, w - .5], labels=[0, w])
127 | ax[1].set_yticks([-.5, h - .5], labels=[0, h])
128 | plt.show()
129 |
130 | return features
131 |
--------------------------------------------------------------------------------
/anyup/layers/convolutions.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | class ResBlock(nn.Module):
5 | def __init__(self, in_channels, out_channels, kernel_size=3, num_groups=8,
6 | pad_mode="zeros", norm_fn=None, activation_fn=nn.SiLU, use_conv_shortcut=False):
7 | super().__init__()
8 | N = (lambda c: norm_fn(num_groups, c)) if norm_fn else (lambda c: nn.Identity())
9 | p = kernel_size // 2
10 | self.block = nn.Sequential(
11 | N(in_channels),
12 | activation_fn(),
13 | nn.Conv2d(in_channels, out_channels, kernel_size, padding=p, padding_mode=pad_mode, bias=False),
14 | N(out_channels),
15 | activation_fn(),
16 | nn.Conv2d(out_channels, out_channels, kernel_size, padding=p, padding_mode=pad_mode, bias=False),
17 | )
18 | self.shortcut = (
19 | nn.Conv2d(in_channels, out_channels, 1, bias=False, padding_mode=pad_mode)
20 | if use_conv_shortcut or in_channels != out_channels else nn.Identity()
21 | )
22 |
23 | def forward(self, x):
24 | return self.block(x) + self.shortcut(x)
25 |
--------------------------------------------------------------------------------
/anyup/layers/feature_unification.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 |
5 | compute_basis_size = {"gauss_deriv": lambda order, mirror: ((order + 1) * (order + 2)) // (1 if mirror else 2)}
6 |
7 |
8 | def herme_vander_torch(z, m):
9 | He0 = z.new_ones(z.shape)
10 | if m == 0: return He0[:, None]
11 | H = [He0, z]
12 | for n in range(1, m):
13 | H.append(z * H[-1] - n * H[-2])
14 | return torch.stack(H, 1)
15 |
16 |
17 | def gauss_deriv(max_order, device, dtype, kernel_size, sigma=None, include_negations=False, scale_magnitude=True):
18 | sigma = (kernel_size // 2) / 1.645 if sigma is None else sigma
19 | if kernel_size % 2 == 0: raise ValueError("ksize must be odd")
20 | half = kernel_size // 2
21 | x = torch.arange(-half, half + 1, dtype=dtype, device=device)
22 | z = x / sigma
23 | g = torch.exp(-0.5 * z ** 2) / (sigma * (2.0 * torch.pi) ** 0.5)
24 | He = herme_vander_torch(z, max_order)
25 | derivs_1d = [(((-1) ** n) / (sigma ** n) if scale_magnitude else (-1) ** n) * He[:, n] * g for n in
26 | range(max_order + 1)]
27 | bank = []
28 | for o in range(max_order + 1):
29 | for i in range(o + 1):
30 | K = torch.outer(derivs_1d[o - i], derivs_1d[i])
31 | bank.append(K)
32 | if include_negations: bank.append(-K)
33 | return torch.stack(bank, 0)
34 |
35 |
36 | class LearnedFeatureUnification(nn.Module):
37 | def __init__(self, out_channels: int, kernel_size: int = 3, init_gaussian_derivatives: bool = False):
38 | super().__init__()
39 | self.out_channels = out_channels
40 | self.kernel_size = kernel_size
41 | if init_gaussian_derivatives:
42 | # find smallest order that gives at least out_channels basis functions
43 | order = 0
44 | while compute_basis_size["gauss_deriv"](order, False) < out_channels:
45 | order += 1
46 | print(f"FeatureUnification: initializing with Gaussian derivative basis of order {order}")
47 | self.basis = nn.Parameter(
48 | gauss_deriv(
49 | order, device='cpu', dtype=torch.float32, kernel_size=kernel_size, scale_magnitude=False
50 | )[:out_channels, None]
51 | )
52 | else:
53 | self.basis = nn.Parameter(
54 | torch.randn(out_channels, 1, kernel_size, kernel_size)
55 | )
56 |
57 | def forward(self, features: torch.Tensor) -> torch.Tensor:
58 | b, c, h, w = features.shape
59 | x = self._depthwise_conv(features, self.basis, self.kernel_size).view(b, self.out_channels, c, h, w)
60 | attn = F.softmax(x, dim=1)
61 | return attn.mean(dim=2)
62 |
63 | @staticmethod
64 | def _depthwise_conv(feats, basis, k):
65 | b, c, h, w = feats.shape
66 | p = k // 2
67 | x = F.pad(feats, (p, p, p, p), value=0)
68 | x = F.conv2d(x, basis.repeat(c, 1, 1, 1), groups=c)
69 | mask = torch.ones(1, 1, h, w, dtype=x.dtype, device=x.device)
70 | denom = F.conv2d(F.pad(mask, (p, p, p, p), value=0), torch.ones(1, 1, k, k, device=x.device))
71 | return x / denom # (B, out_channels*C, H, W)
72 |
--------------------------------------------------------------------------------
/anyup/layers/positional_encoding.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | def rotate_half(x):
5 | x1, x2 = x.chunk(2, dim=-1)
6 | return torch.cat((-x2, x1), dim=-1)
7 |
8 |
9 | class RoPE(nn.Module):
10 | def __init__(
11 | self,
12 | dim: int,
13 | theta: int = 100,
14 | ):
15 | super().__init__()
16 | self.dim = dim
17 | self.theta = theta
18 | self.freqs = nn.Parameter(torch.empty(2, self.dim))
19 |
20 | def _device_weight_init(self):
21 | freqs_1d = self.theta ** torch.linspace(0, -1, self.dim // 4)
22 | freqs_1d = torch.cat([freqs_1d, freqs_1d])
23 | freqs_2d = torch.zeros(2, self.dim)
24 | freqs_2d[0, : self.dim // 2] = freqs_1d
25 | freqs_2d[1, -self.dim // 2 :] = freqs_1d
26 | self.freqs.data.copy_(freqs_2d * 2 * torch.pi)
27 |
28 | def forward(self, x: torch.Tensor, coords: torch.Tensor) -> torch.Tensor:
29 | angle = coords @ self.freqs
30 | return x * angle.cos() + rotate_half(x) * angle.sin()
31 |
--------------------------------------------------------------------------------
/anyup/model.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch.nn.functional as F
3 | import torch
4 |
5 | from .layers import ResBlock
6 | from .layers import LearnedFeatureUnification
7 | from .layers import CrossAttentionBlock
8 | from .layers import RoPE
9 | from .utils.img import create_coordinate
10 |
11 |
12 | class AnyUp(nn.Module):
13 | def __init__(
14 | self,
15 | input_dim=3,
16 | qk_dim=128,
17 | kernel_size=1,
18 | kernel_size_lfu=5,
19 | window_ratio=0.1,
20 | num_heads=4,
21 | init_gaussian_derivatives=False,
22 | **kwargs,
23 | ):
24 | super().__init__()
25 | self.qk_dim = qk_dim
26 | self.window_ratio = window_ratio
27 | self._rb_args = dict(kernel_size=1, num_groups=8, pad_mode="reflect", norm_fn=nn.GroupNorm,
28 | activation_fn=nn.SiLU)
29 |
30 | # Encoders
31 | self.image_encoder = self._make_encoder(input_dim, kernel_size)
32 | self.key_encoder = self._make_encoder(qk_dim, 1)
33 | self.query_encoder = self._make_encoder(qk_dim, 1)
34 | self.key_features_encoder = self._make_encoder(None, 1, first_layer_k=kernel_size_lfu,
35 | init_gaussian_derivatives=init_gaussian_derivatives)
36 |
37 | # Cross-attention
38 | self.cross_decode = CrossAttentionBlock(qk_dim=qk_dim, num_heads=num_heads, window_ratio=window_ratio)
39 | self.aggregation = self._make_encoder(2 * qk_dim, 3)
40 |
41 | # RoPE for (H*W, C)
42 | self.rope = RoPE(qk_dim)
43 | self.rope._device_weight_init()
44 |
45 | def _make_encoder(self, in_ch, k, layers=2, first_layer_k=0, init_gaussian_derivatives=False):
46 | pre = (
47 | nn.Conv2d(in_ch, self.qk_dim, k, padding=k // 2, padding_mode="reflect", bias=False)
48 | if first_layer_k == 0 else
49 | LearnedFeatureUnification(self.qk_dim, first_layer_k, init_gaussian_derivatives=init_gaussian_derivatives)
50 | )
51 | blocks = [ResBlock(self.qk_dim, self.qk_dim, **self._rb_args) for _ in range(layers)]
52 | return nn.Sequential(pre, *blocks)
53 |
54 | def upsample(self, enc_img, feats, out_size, vis_attn=False, q_chunk_size=None):
55 | b, c, h, w = feats.shape
56 |
57 | # Q
58 | q = F.adaptive_avg_pool2d(self.query_encoder(enc_img), output_size=out_size)
59 |
60 | # K
61 | k = F.adaptive_avg_pool2d(self.key_encoder(enc_img), output_size=(h, w))
62 | k = torch.cat([k, self.key_features_encoder(F.normalize(feats, dim=1))], dim=1)
63 | k = self.aggregation(k)
64 |
65 | # V
66 | v = feats
67 |
68 | return self.cross_decode(q, k, v, vis_attn=vis_attn, q_chunk_size=q_chunk_size)
69 |
70 | def forward(self, image, features, output_size=None, vis_attn=False, q_chunk_size=None):
71 | output_size = output_size if output_size is not None else image.shape[-2:]
72 | enc = self.image_encoder(image)
73 | h = enc.shape[-2]
74 | coords = create_coordinate(h, enc.shape[-1], device=enc.device, dtype=enc.dtype)
75 | enc = enc.permute(0, 2, 3, 1).view(enc.shape[0], -1, enc.shape[1])
76 | enc = self.rope(enc, coords)
77 | enc = enc.view(enc.shape[0], h, -1, enc.shape[-1]).permute(0, 3, 1, 2)
78 | return self.upsample(enc, features, output_size, vis_attn=vis_attn, q_chunk_size=q_chunk_size)
79 |
--------------------------------------------------------------------------------
/anyup/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wimmerth/anyup/0dbfc182cb3481abcb090253b86dc5f652f7a537/anyup/utils/__init__.py
--------------------------------------------------------------------------------
/anyup/utils/img.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def create_coordinate(h, w, start=0.0, end=1.0, device=None, dtype=None):
4 | x = torch.linspace(start, end, h, device=device, dtype=dtype)
5 | y = torch.linspace(start, end, w, device=device, dtype=dtype)
6 | xx, yy = torch.meshgrid(x, y, indexing="ij")
7 | return torch.stack((xx, yy), -1).view(1, h * w, 2)
8 |
--------------------------------------------------------------------------------
/anyup/utils/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from .attention_visualization import visualize_attention_oklab
3 |
4 | IMAGENET_MEAN = [0.485, 0.456, 0.406]
5 | IMAGENET_STD = [0.229, 0.224, 0.225]
6 |
7 | def unnormalize(t, mean=None, std=None):
8 | if mean is None: mean = IMAGENET_MEAN
9 | if std is None: std = IMAGENET_STD
10 | m = torch.as_tensor(mean, device=t.device, dtype=t.dtype).view(1, -1, 1, 1)
11 | s = torch.as_tensor(std, device=t.device, dtype=t.dtype).view(1, -1, 1, 1)
12 | return t * s + m
--------------------------------------------------------------------------------
/anyup/utils/visualization/attention_visualization.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def srgb_to_linear(c):
5 | a = 0.055
6 | return torch.where(c <= 0.04045, c / 12.92, ((c + a) / (1 + a)) ** 2.4)
7 |
8 |
9 | def linear_to_srgb(c):
10 | a = 0.055
11 | c = torch.clamp(c, 0.0, 1.0) # simple gamut clamp in linear light
12 | return torch.where(c <= 0.0031308, 12.92 * c, (1 + a) * torch.pow(c, 1 / 2.4) - a)
13 |
14 |
15 | def oklab_to_linear_srgb(L, a, b):
16 | l_ = L + 0.3963377774 * a + 0.2158037573 * b
17 | m_ = L - 0.1055613458 * a - 0.0638541728 * b
18 | s_ = L - 0.0894841775 * a - 1.2914855480 * b
19 | l, m, s = l_ ** 3, m_ ** 3, s_ ** 3
20 | R = +4.0767416621 * l - 3.3077115913 * m + 0.2309699292 * s
21 | G = -1.2684380046 * l + 2.6097574011 * m - 0.3413193965 * s
22 | B = -0.0041960863 * l - 0.7034186147 * m + 1.7076147010 * s
23 | return torch.stack([R, G, B], dim=-1)
24 |
25 |
26 | def oklch_grid(h_k, w_k, col_range=.7):
27 | i, j = torch.meshgrid(
28 | torch.arange(-col_range / 2, col_range / 2, col_range / h_k),
29 | torch.arange(-col_range / 2, col_range / 2, col_range / w_k),
30 | indexing='ij'
31 | )
32 | rgb = oklab_to_linear_srgb(torch.full_like(i, .7), i, j)
33 | return rgb
34 |
35 |
36 | def visualize_attention_oklab(attn, h_q, w_q, h_k=None, w_k=None):
37 | h_k = h_k or h_q
38 | w_k = w_k or w_q
39 |
40 | num_q, num_k = attn.shape
41 | assert 0 < h_q * w_q <= num_q
42 | assert 0 < h_k * w_k <= num_k
43 | if h_q * w_q < num_q: attn = attn[-h_q * w_q:]
44 | if h_k * w_k < num_k: attn = attn[:, -h_k * w_k:]
45 |
46 | # rows sum to 1
47 | attn = torch.nn.functional.normalize(attn, p=1, dim=1)
48 |
49 | ref_lin = oklch_grid(h_k, w_k).to(attn.device) # [h_k, w_k, 3]
50 | ref_rgb = linear_to_srgb(ref_lin)
51 | ref_lin = ref_lin.view(-1, 3)
52 |
53 | out_lin = attn @ ref_lin # [(h_q*w_q), 3]
54 | out_rgb = linear_to_srgb(out_lin.view(h_q, w_q, 3))
55 | return ref_rgb, out_rgb
56 |
--------------------------------------------------------------------------------
/hubconf.py:
--------------------------------------------------------------------------------
1 | dependencies = ['torch']
2 |
3 | from anyup.model import AnyUp
4 | import torch
5 |
6 |
7 | def anyup(pretrained: bool = True, device='cpu'):
8 | """
9 | AnyUp model trained on DINOv2 ViT-S/14 features, used in most experiments of the paper.
10 | Note: If you want to use vis_attn, you also need to install matplotlib.
11 | """
12 | model = AnyUp().to(device)
13 | if pretrained:
14 | checkpoint = "https://github.com/wimmerth/anyup/releases/download/checkpoint/anyup_paper.pth"
15 | model.load_state_dict(torch.hub.load_state_dict_from_url(checkpoint, progress=True, map_location=device))
16 | return model
17 |
--------------------------------------------------------------------------------
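For completeness, a minimal sketch of calling the entrypoint above from a local clone (standard `torch.hub` usage; assumes the repository root is the current working directory):

```python
import torch

# source='local' runs the local hubconf.py instead of downloading the repo;
# the pretrained checkpoint is still fetched from the GitHub release.
upsampler = torch.hub.load('.', 'anyup', source='local', pretrained=True, device='cpu')
upsampler.eval()
```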