├── .github
    └── workflows
    │   └── check_fmt.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── assets
    ├── pvs.png
    └── speed-performance.png
├── checkpoints
    ├── download_ckpts.sh
    └── edgetam.pt
├── convert_weights.py
├── examples
    ├── 01_breakdancer.mp4
    ├── 01_dog.mp4
    ├── 02_cups.mp4
    ├── 02_hummingbird.mp4
    ├── 03_blocks.mp4
    ├── 03_skateboarder.mp4
    ├── 04_coffee.mp4
    ├── 04_octopus.mp4
    ├── 05_default_juggle.mp4
    ├── 05_landing_dog_soccer.mp4
    ├── 06_pingpong.mp4
    ├── 07_snowboarder.mp4
    ├── 08_driving.mp4
    ├── 09_birdcartoon.mp4
    ├── 10_cloth_magic.mp4
    ├── 11_polevault.mp4
    ├── 12_hideandseek.mp4
    ├── 13_butterfly.mp4
    ├── 14_social_dog_training.mp4
    ├── 15_cricket.mp4
    ├── 16_robotarm.mp4
    ├── 17_childrendancing.mp4
    ├── 18_threedogs.mp4
    ├── 19_cyclist.mp4
    ├── 20_doughkneading.mp4
    ├── 21_biker.mp4
    ├── 22_dogskateboarder.mp4
    ├── 23_racecar.mp4
    └── 24_clownfish.mp4
├── gradio_app.py
├── notebooks
    ├── automatic_mask_generator_example.ipynb
    ├── image_predictor_example.ipynb
    ├── images
    │   ├── cars.jpg
    │   ├── groceries.jpg
    │   └── truck.jpg
    ├── video_predictor_example.ipynb
    └── videos
    │   ├── bedroom.mp4
    │   └── bedroom
    │       ├── 00000.jpg
    │       ├── 00001.jpg
    │       ├── 00002.jpg
    │       ├── 00003.jpg
    │       ├── 00004.jpg
    │       ├── 00005.jpg
    │       ├── 00006.jpg
    │       ├── 00007.jpg
    │       ├── 00008.jpg
    │       ├── 00009.jpg
    │       ├── 00010.jpg
    │       ├── 00011.jpg
    │       ├── 00012.jpg
    │       ├── 00013.jpg
    │       ├── 00014.jpg
    │       ├── 00015.jpg
    │       ├── 00016.jpg
    │       ├── 00017.jpg
    │       ├── 00018.jpg
    │       ├── 00019.jpg
    │       ├── 00020.jpg
    │       ├── 00021.jpg
    │       ├── 00022.jpg
    │       ├── 00023.jpg
    │       ├── 00024.jpg
    │       ├── 00025.jpg
    │       ├── 00026.jpg
    │       ├── 00027.jpg
    │       ├── 00028.jpg
    │       ├── 00029.jpg
    │       ├── 00030.jpg
    │       ├── 00031.jpg
    │       ├── 00032.jpg
    │       ├── 00033.jpg
    │       ├── 00034.jpg
    │       ├── 00035.jpg
    │       ├── 00036.jpg
    │       ├── 00037.jpg
    │       ├── 00038.jpg
    │       ├── 00039.jpg
    │       ├── 00040.jpg
    │       ├── 00041.jpg
    │       ├── 00042.jpg
    │       ├── 00043.jpg
    │       ├── 00044.jpg
    │       ├── 00045.jpg
    │       ├── 00046.jpg
    │       ├── 00047.jpg
    │       ├── 00048.jpg
    │       ├── 00049.jpg
    │       ├── 00050.jpg
    │       ├── 00051.jpg
    │       ├── 00052.jpg
    │       ├── 00053.jpg
    │       ├── 00054.jpg
    │       ├── 00055.jpg
    │       ├── 00056.jpg
    │       ├── 00057.jpg
    │       ├── 00058.jpg
    │       ├── 00059.jpg
    │       ├── 00060.jpg
    │       ├── 00061.jpg
    │       ├── 00062.jpg
    │       ├── 00063.jpg
    │       ├── 00064.jpg
    │       ├── 00065.jpg
    │       ├── 00066.jpg
    │       ├── 00067.jpg
    │       ├── 00068.jpg
    │       ├── 00069.jpg
    │       ├── 00070.jpg
    │       ├── 00071.jpg
    │       ├── 00072.jpg
    │       ├── 00073.jpg
    │       ├── 00074.jpg
    │       ├── 00075.jpg
    │       ├── 00076.jpg
    │       ├── 00077.jpg
    │       ├── 00078.jpg
    │       ├── 00079.jpg
    │       ├── 00080.jpg
    │       ├── 00081.jpg
    │       ├── 00082.jpg
    │       ├── 00083.jpg
    │       ├── 00084.jpg
    │       ├── 00085.jpg
    │       ├── 00086.jpg
    │       ├── 00087.jpg
    │       ├── 00088.jpg
    │       ├── 00089.jpg
    │       ├── 00090.jpg
    │       ├── 00091.jpg
    │       ├── 00092.jpg
    │       ├── 00093.jpg
    │       ├── 00094.jpg
    │       ├── 00095.jpg
    │       ├── 00096.jpg
    │       ├── 00097.jpg
    │       ├── 00098.jpg
    │       ├── 00099.jpg
    │       ├── 00100.jpg
    │       ├── 00101.jpg
    │       ├── 00102.jpg
    │       ├── 00103.jpg
    │       ├── 00104.jpg
    │       ├── 00105.jpg
    │       ├── 00106.jpg
    │       ├── 00107.jpg
    │       ├── 00108.jpg
    │       ├── 00109.jpg
    │       ├── 00110.jpg
    │       ├── 00111.jpg
    │       ├── 00112.jpg
    │       ├── 00113.jpg
    │       ├── 00114.jpg
    │       ├── 00115.jpg
    │       ├── 00116.jpg
    │       ├── 00117.jpg
    │       ├── 00118.jpg
    │       ├── 00119.jpg
    │       ├── 00120.jpg
    │       ├── 00121.jpg
    │       ├── 00122.jpg
    │       ├── 00123.jpg
    │       ├── 00124.jpg
    │       ├── 00125.jpg
    │       ├── 00126.jpg
    │       ├── 00127.jpg
    │       ├── 00128.jpg
    │       ├── 00129.jpg
    │       ├── 00130.jpg
    │       ├── 00131.jpg
    │       ├── 00132.jpg
    │       ├── 00133.jpg
    │       ├── 00134.jpg
    │       ├── 00135.jpg
    │       ├── 00136.jpg
    │       ├── 00137.jpg
    │       ├── 00138.jpg
    │       ├── 00139.jpg
    │       ├── 00140.jpg
    │       ├── 00141.jpg
    │       ├── 00142.jpg
    │       ├── 00143.jpg
    │       ├── 00144.jpg
    │       ├── 00145.jpg
    │       ├── 00146.jpg
    │       ├── 00147.jpg
    │       ├── 00148.jpg
    │       ├── 00149.jpg
    │       ├── 00150.jpg
    │       ├── 00151.jpg
    │       ├── 00152.jpg
    │       ├── 00153.jpg
    │       ├── 00154.jpg
    │       ├── 00155.jpg
    │       ├── 00156.jpg
    │       ├── 00157.jpg
    │       ├── 00158.jpg
    │       ├── 00159.jpg
    │       ├── 00160.jpg
    │       ├── 00161.jpg
    │       ├── 00162.jpg
    │       ├── 00163.jpg
    │       ├── 00164.jpg
    │       ├── 00165.jpg
    │       ├── 00166.jpg
    │       ├── 00167.jpg
    │       ├── 00168.jpg
    │       ├── 00169.jpg
    │       ├── 00170.jpg
    │       ├── 00171.jpg
    │       ├── 00172.jpg
    │       ├── 00173.jpg
    │       ├── 00174.jpg
    │       ├── 00175.jpg
    │       ├── 00176.jpg
    │       ├── 00177.jpg
    │       ├── 00178.jpg
    │       ├── 00179.jpg
    │       ├── 00180.jpg
    │       ├── 00181.jpg
    │       ├── 00182.jpg
    │       ├── 00183.jpg
    │       ├── 00184.jpg
    │       ├── 00185.jpg
    │       ├── 00186.jpg
    │       ├── 00187.jpg
    │       ├── 00188.jpg
    │       ├── 00189.jpg
    │       ├── 00190.jpg
    │       ├── 00191.jpg
    │       ├── 00192.jpg
    │       ├── 00193.jpg
    │       ├── 00194.jpg
    │       ├── 00195.jpg
    │       ├── 00196.jpg
    │       ├── 00197.jpg
    │       ├── 00198.jpg
    │       └── 00199.jpg
├── pyproject.toml
├── sam2
    ├── __init__.py
    ├── automatic_mask_generator.py
    ├── build_sam.py
    ├── configs
    │   ├── edgetam.yaml
    │   ├── sam2.1
    │   │   ├── sam2.1_hiera_b+.yaml
    │   │   ├── sam2.1_hiera_l.yaml
    │   │   ├── sam2.1_hiera_s.yaml
    │   │   └── sam2.1_hiera_t.yaml
    │   ├── sam2.1_training
    │   │   └── sam2.1_hiera_b+_MOSE_finetune.yaml
    │   └── sam2
    │   │   ├── sam2_hiera_b+.yaml
    │   │   ├── sam2_hiera_l.yaml
    │   │   ├── sam2_hiera_s.yaml
    │   │   └── sam2_hiera_t.yaml
    ├── csrc
    │   └── connected_components.cu
    ├── edgetam.yaml
    ├── modeling
    │   ├── __init__.py
    │   ├── backbones
    │   │   ├── __init__.py
    │   │   ├── hieradet.py
    │   │   ├── image_encoder.py
    │   │   ├── timm.py
    │   │   └── utils.py
    │   ├── memory_attention.py
    │   ├── memory_encoder.py
    │   ├── perceiver.py
    │   ├── position_encoding.py
    │   ├── sam
    │   │   ├── __init__.py
    │   │   ├── mask_decoder.py
    │   │   ├── prompt_encoder.py
    │   │   └── transformer.py
    │   ├── sam2_base.py
    │   └── sam2_utils.py
    ├── sam2_hiera_b+.yaml
    ├── sam2_hiera_l.yaml
    ├── sam2_hiera_s.yaml
    ├── sam2_hiera_t.yaml
    ├── sam2_image_predictor.py
    ├── sam2_video_predictor.py
    └── utils
    │   ├── __init__.py
    │   ├── amg.py
    │   ├── misc.py
    │   └── transforms.py
├── setup.py
└── tools
    ├── README.md
    └── vos_inference.py


/.github/workflows/check_fmt.yml:
--------------------------------------------------------------------------------
 1 | name: SAM2/fmt
 2 | on:
 3 |   pull_request:
 4 |     branches:
 5 |     - main
 6 | jobs:
 7 |   ufmt_check:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - name: Check formatting
11 |         uses: omnilib/ufmt@action-v1
12 |         with:
13 |           path: sam2 tools
14 |           version: "2.0.0b2"
15 |           python-version: "3.10"
16 |           black-version: "24.2.0"
17 |           usort-version: "1.0.2"
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/
 2 | .DS_Store
 3 | __pycache__/
 4 | *-checkpoint.ipynb
 5 | .venv
 6 | *.egg*
 7 | build/*
 8 | _C.*
 9 | outputs/*
10 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to make participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | This Code of Conduct also applies outside the project spaces when there is a
56 | reasonable belief that an individual's behavior may have a negative impact on
57 | the project or its community.
58 | 
59 | ## Enforcement
60 | 
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported by contacting the project team at <opensource-conduct@meta.com>. All
63 | complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 | 
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 | 
72 | ## Attribution
73 | 
74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76 | 
77 | [homepage]: https://www.contributor-covenant.org
78 | 
79 | For answers to common questions about this code of conduct, see
80 | https://www.contributor-covenant.org/faq
81 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to EdgeTAM
 2 | We want to make contributing to this project as easy and transparent as
 3 | possible.
 4 | 
 5 | ## Pull Requests
 6 | We actively welcome your pull requests.
 7 | 
 8 | 1. Fork the repo and create your branch from `main`.
 9 | 2. If you've added code that should be tested, add tests.
10 | 3. If you've changed APIs, update the documentation.
11 | 4. Ensure the test suite passes.
12 | 5. Make sure your code lints.
13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14 | 
15 | ## Contributor License Agreement ("CLA")
16 | In order to accept your pull request, we need you to submit a CLA. You only need
17 | to do this once to work on any of Meta's open source projects.
18 | 
19 | Complete your CLA here: <https://code.facebook.com/cla>
20 | 
21 | ## Issues
22 | We use GitHub issues to track public bugs. Please ensure your description is
23 | clear and has sufficient instructions to be able to reproduce the issue.
24 | 
25 | Meta has a [bounty program](https://bugbounty.meta.com/) for the safe
26 | disclosure of security bugs. In those cases, please go through the process
27 | outlined on that page and do not file a public issue.
28 | 
29 | ## License
30 | By contributing to EdgeTAM, you agree that your contributions will be licensed
31 | under the LICENSE file in the root directory of this source tree.
32 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # EdgeTAM: On-Device Track Anything Model
  2 | 
  3 | [Chong Zhou<sup>1,2*</sup>](https://chongzhou96.github.io/),
  4 | [Chenchen Zhu<sup>1</sup>](https://sites.google.com/andrew.cmu.edu/zcckernel/home),
  5 | [Yunyang Xiong<sup>1</sup>](https://pages.cs.wisc.edu/~yunyang/),
  6 | [Saksham Suri<sup>1</sup>](https://www.cs.umd.edu/~sakshams/),
  7 | [Fanyi Xiao<sup>1</sup>](https://fanyix.cs.ucdavis.edu/),
  8 | [Lemeng Wu<sup>1</sup>](https://sites.google.com/view/lemeng-wu/home),
  9 | [Raghuraman Krishnamoorthi<sup>1</sup>](https://scholar.google.com/citations?user=F1mr9C0AAAAJ&hl=en),
 10 | [Bo Dai<sup>3,4</sup>](https://daibo.info/),
 11 | [Chen Change Loy<sup>2</sup>](https://www.mmlab-ntu.com/person/ccloy/),
 12 | [Vikas Chandra<sup>1</sup>](https://v-chandra.github.io/),
 13 | [Bilge Soran<sup>1</sup>](https://scholar.google.com/citations?user=9nXD6pwAAAAJ&hl=en)
 14 | 
 15 | <sup>1</sup>Meta Reality Labs,
 16 | <sup>2</sup>S-Lab, Nanyang Technological University,
 17 | <sup>3</sup>University of Hong Kong,
 18 | <sup>4</sup>Feeling AI
 19 | 
 20 | (*) Work done during the internship at Meta Reality Labs.
 21 | 
 22 | [[`Paper`](https://arxiv.org/abs/2501.07256)] [[`Demo`](https://huggingface.co/spaces/facebook/EdgeTAM)] [[`BibTeX`](#citing-edgetam)]
 23 | 
 24 | 
 25 | ## Overview
 26 | 
 27 | **EdgeTAM** is an on-device executable variant of the SAM 2 for promptable segmentation and tracking in videos.
 28 | It runs **22× faster** than SAM 2 and achieves **16 FPS** on iPhone 15 Pro Max without quantization.
 29 | 
 30 | <p align="center">
 31 |   <img src="assets/speed-performance.png?raw=true" width="400"/>
 32 | </p>
 33 | 
 34 | *In this figure, we show the speed-performance trade-offs of EdgeTAM and other models on iPhone 15 Pro Max (red) and NVIDIA A100 (blue). We report the J&F on the SA-V val dataset as the evaluation metric.*
 35 | 
 36 | ## Installation
 37 | 
 38 | EdgeTAM needs to be installed first before use. The code requires `python>=3.10`, as well as `torch>=2.3.1` and `torchvision>=0.18.1`. Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install both PyTorch and TorchVision dependencies. You can install EdgeTAM on a GPU machine using:
 39 | 
 40 | ```bash
 41 | git clone https://github.com/facebookresearch/EdgeTAM.git && cd EdgeTAM
 42 | 
 43 | pip install -e .
 44 | ```
 45 | 
 46 | To use the EdgeTAM predictor and run the example notebooks, `jupyter` and `matplotlib` are required and can be installed by:
 47 | 
 48 | ```bash
 49 | pip install -e ".[notebooks]"
 50 | ```
 51 | 
 52 | Note:
 53 | 1. It's recommended to create a new Python environment via [Anaconda](https://www.anaconda.com/) for this installation and install PyTorch 2.3.1 (or higher) via `pip` following https://pytorch.org/. If you have a PyTorch version lower than 2.3.1 in your current environment, the installation command above will try to upgrade it to the latest PyTorch version using `pip`.
 54 | 2. The step above requires compiling a custom CUDA kernel with the `nvcc` compiler. If it isn't already available on your machine, please install the [CUDA toolkits](https://developer.nvidia.com/cuda-toolkit-archive) with a version that matches your PyTorch CUDA version.
 55 | 3. If you see a message like `Failed to build the SAM 2 CUDA extension` during installation, you can ignore it and still use EdgeTAM (some post-processing functionality may be limited, but it doesn't affect the results in most cases).
 56 | 
 57 | 
 58 | ## Getting Started
 59 | 
 60 | ### Downloading the model
 61 | 
 62 | Model is available [here](https://github.com/facebookresearch/EdgeTAM/tree/main/checkpoints/edgetam.pt).
 63 | 
 64 | ### On-device Gradio demo for EdgeTAM
 65 | Follow the instructions below to run the on-device demo for EdgeTAM. If you want to quickly try out the demo, you can also go to [Hugging Face Spaces](https://huggingface.co/spaces/facebook/EdgeTAM).
 66 | 
 67 | Install the dependencies for the Gradio demo:
 68 | 
 69 | ```bash
 70 | pip install -e ".[gradio]"
 71 | ```
 72 | 
 73 | Run the demo:
 74 | 
 75 | ```bash
 76 | python gradio_app.py
 77 | ```
 78 | 
 79 | The demo will be available at http://127.0.0.1:7860/ by default. You can change the port by setting the `--port` argument.
 80 | 
 81 | ### Image prediction
 82 | 
 83 | EdgeTAM has all the capabilities of [SAM](https://github.com/facebookresearch/segment-anything) on static images, and we provide image prediction APIs that closely resemble SAM for image use cases. The `SAM2ImagePredictor` class has an easy interface for image prompting.
 84 | 
 85 | ```python
 86 | import torch
 87 | from sam2.build_sam import build_sam2
 88 | from sam2.sam2_image_predictor import SAM2ImagePredictor
 89 | 
 90 | checkpoint = "./checkpoints/edgetam.pt"
 91 | model_cfg = "configs/edgetam.yaml"
 92 | predictor = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint))
 93 | 
 94 | with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
 95 |     predictor.set_image(<your_image>)
 96 |     masks, _, _ = predictor.predict(<input_prompts>)
 97 | ```
 98 | 
 99 | Please refer to the examples in [image_predictor_example.ipynb](./notebooks/image_predictor_example.ipynb) for static image use cases.
100 | 
101 | EdgeTAM also supports automatic mask generation on images just like SAM. Please see [automatic_mask_generator_example.ipynb](./notebooks/automatic_mask_generator_example.ipynb) for automatic mask generation in images.
102 | 
103 | ### Video prediction
104 | 
105 | For promptable segmentation and tracking in videos, we provide a video predictor with APIs for example to add prompts and propagate masklets throughout a video. EdgeTAM supports video inference on multiple objects and uses an inference state to keep track of the interactions in each video.
106 | 
107 | ```python
108 | import torch
109 | from sam2.build_sam import build_sam2_video_predictor
110 | 
111 | checkpoint = "./checkpoints/edgetam.pt"
112 | model_cfg = "configs/edgetam.yaml"
113 | predictor = build_sam2_video_predictor(model_cfg, checkpoint)
114 | 
115 | with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
116 |     state = predictor.init_state(<your_video>)
117 | 
118 |     # add new prompts and instantly get the output on the same frame
119 |     frame_idx, object_ids, masks = predictor.add_new_points_or_box(state, <your_prompts>):
120 | 
121 |     # propagate the prompts to get masklets throughout the video
122 |     for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
123 |         ...
124 | ```
125 | 
126 | Please refer to the examples in [video_predictor_example.ipynb](./notebooks/video_predictor_example.ipynb) for details on how to add click or box prompts, make refinements, and track multiple objects in videos.
127 | 
128 | ## Performance
129 | ### Promptable Video Segmentation (PVS)
130 | <p align="center">
131 |   <img src="assets/pvs.png?raw=true" width="700"/>
132 | </p>
133 | 
134 | *Zero-shot PVS accuracy across 9 datasets in offline and online settings.*
135 | 
136 | ### Video Object Segmentation (VOS)
137 | | Method         | MOSE val | DAVIS 2017 val | SA-V val | SA-V test | YTVOS 2019 val | A100  | V100  | iPhone |
138 | |----------------|----------|----------------|----------|-----------|----------------|-------|-------|--------|
139 | | STCN           | 52.5     | 85.4           | 61.0     | 62.5      | 82.7           | 62.8  | 13.2  | -      |
140 | | SwinB-AOT      | 59.4     | 85.4           | 51.1     | 50.3      | 84.5           | -     | -     | -      |
141 | | SwinB-DeAOT    | 59.9     | 86.2           | 61.4     | 61.8      | 86.1           | -     | -     | -      |
142 | | RDE            | 46.8     | 84.2           | 51.8     | 53.9      | 81.9           | 88.8  | 24.4  | -      |
143 | | XMem           | 59.6     | 86.0           | 60.1     | 62.3      | 85.6           | 61.2  | 22.6  | -      |
144 | | SimVOS-B       | -        | 88.0           | 44.2     | 44.1      | 84.2           | -     | 3.3   | -      |
145 | | JointFormer    | -        | 90.1           | -        | -         | 87.4           | -     | 3.0   | -      |
146 | | ISVOS          | -        | 88.2           | -        | -         | 86.3           | -     | 5.8   | -      |
147 | | DEVA           | 66.0     | 87.0           | 55.4     | 56.2      | 85.4           | 65.2  | 25.3  | -      |
148 | | Cutie-base     | 69.9     | 87.9           | 60.7     | 62.7      | 87.0           | 65.0  | 36.4  | -      |
149 | | Cutie-base+    | 71.7     | 88.1           | 61.3     | 62.8      | 87.5           | 57.2  | 17.9  | -      |
150 | | SAM 2-B+       | 75.8     | **90.9**       | 73.6     | 74.1      | 88.4           | 64.8  | -     | 0.7    |
151 | | SAM 2.1-B+     | **76.6** | 90.2           | **76.8** | **77.0**  | **88.6**       | 64.1  | -     | 0.7    |
152 | | **EdgeTAM**    | 70.0     | 87.7           | 72.3     | 71.7      | 86.2           | **150.9** | - | **15.7** |
153 | 
154 | *We report the G for YTVOS and J&F for other datasets. The FPS on A100 is obtained with torch compile. Nota that, for SAM 2, SAM 2.1, and EdgeTAM, we evaluate all the datasets with the same model.*
155 | 
156 | ### Segment Anything (SA)
157 | | Model    | Data        | SA-23 All        | SA-23 Image      | SA-23 Video      | FPS   |
158 | |----------|-------------|------------------|------------------|------------------|-------|
159 | | SAM      | SA-1B       | 58.1 (81.3)      | 60.8 (82.1)      | 54.5 (80.3)      | -     |
160 | | SAM 2    | SA-1B       | 58.9 (81.7)      | 60.8 (82.1)      | 56.4 (81.2)      | 1.3   |
161 | | SAM 2    | SAM2’s mix  | 61.4 (83.7)      | 63.1 (83.9)      | 59.1 (83.3)      | 1.3   |
162 | | SAM 2.1  | SAM2’s mix  | **61.9 (83.5)**  | **63.3 (83.8)**  | **60.1 (83.2)**  | 1.3   |
163 | | **EdgeTAM** | Our mix  | 55.5 (81.7)      | 56.0 (81.9)      | 54.8 (81.5)      | **40.4** |
164 | 
165 | *We report 1 (5) click mIoU results. FPS is measured on iPhone 15 Pro Max. Our mix does not contain the internal datasets that SAM 2 uses.*
166 | 
167 | ## License
168 | 
169 | The EdgeTAM model checkpoints and code are licensed under [Apache 2.0](./LICENSE).
170 | 
171 | 
172 | ## Citing EdgeTAM
173 | 
174 | If you use EdgeTAM in your research, please use the following BibTeX entry.
175 | 
176 | ```bibtex
177 | @article{zhou2025edgetam,
178 |   title={EdgeTAM: On-Device Track Anything Model},
179 |   author={Zhou, Chong and Zhu, Chenchen and Xiong, Yunyang and Suri, Saksham and Xiao, Fanyi and Wu, Lemeng and Krishnamoorthi, Raghuraman and Dai, Bo and Loy, Chen Change and Chandra, Vikas and Soran, Bilge},
180 |   journal={arXiv preprint arXiv:2501.07256},
181 |   year={2025}
182 | }
183 | ```
184 | 


--------------------------------------------------------------------------------
/assets/pvs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/assets/pvs.png


--------------------------------------------------------------------------------
/assets/speed-performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/assets/speed-performance.png


--------------------------------------------------------------------------------
/checkpoints/download_ckpts.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 4 | # All rights reserved.
 5 | 
 6 | # This source code is licensed under the license found in the
 7 | # LICENSE file in the root directory of this source tree.
 8 | 
 9 | # Use either wget or curl to download the checkpoints
10 | if command -v wget &> /dev/null; then
11 |     CMD="wget"
12 | elif command -v curl &> /dev/null; then
13 |     CMD="curl -L -O"
14 | else
15 |     echo "Please install wget or curl to download the checkpoints."
16 |     exit 1
17 | fi
18 | 
19 | # Define the URLs for SAM 2 checkpoints
20 | # SAM2_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/072824"
21 | # sam2_hiera_t_url="${SAM2_BASE_URL}/sam2_hiera_tiny.pt"
22 | # sam2_hiera_s_url="${SAM2_BASE_URL}/sam2_hiera_small.pt"
23 | # sam2_hiera_b_plus_url="${SAM2_BASE_URL}/sam2_hiera_base_plus.pt"
24 | # sam2_hiera_l_url="${SAM2_BASE_URL}/sam2_hiera_large.pt"
25 | 
26 | # Download each of the four checkpoints using wget
27 | # echo "Downloading sam2_hiera_tiny.pt checkpoint..."
28 | # $CMD $sam2_hiera_t_url || { echo "Failed to download checkpoint from $sam2_hiera_t_url"; exit 1; }
29 | 
30 | # echo "Downloading sam2_hiera_small.pt checkpoint..."
31 | # $CMD $sam2_hiera_s_url || { echo "Failed to download checkpoint from $sam2_hiera_s_url"; exit 1; }
32 | 
33 | # echo "Downloading sam2_hiera_base_plus.pt checkpoint..."
34 | # $CMD $sam2_hiera_b_plus_url || { echo "Failed to download checkpoint from $sam2_hiera_b_plus_url"; exit 1; }
35 | 
36 | # echo "Downloading sam2_hiera_large.pt checkpoint..."
37 | # $CMD $sam2_hiera_l_url || { echo "Failed to download checkpoint from $sam2_hiera_l_url"; exit 1; }
38 | 
39 | # Define the URLs for SAM 2.1 checkpoints
40 | SAM2p1_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/092824"
41 | sam2p1_hiera_t_url="${SAM2p1_BASE_URL}/sam2.1_hiera_tiny.pt"
42 | sam2p1_hiera_s_url="${SAM2p1_BASE_URL}/sam2.1_hiera_small.pt"
43 | sam2p1_hiera_b_plus_url="${SAM2p1_BASE_URL}/sam2.1_hiera_base_plus.pt"
44 | sam2p1_hiera_l_url="${SAM2p1_BASE_URL}/sam2.1_hiera_large.pt"
45 | 
46 | # SAM 2.1 checkpoints
47 | echo "Downloading sam2.1_hiera_tiny.pt checkpoint..."
48 | $CMD $sam2p1_hiera_t_url || { echo "Failed to download checkpoint from $sam2p1_hiera_t_url"; exit 1; }
49 | 
50 | echo "Downloading sam2.1_hiera_small.pt checkpoint..."
51 | $CMD $sam2p1_hiera_s_url || { echo "Failed to download checkpoint from $sam2p1_hiera_s_url"; exit 1; }
52 | 
53 | echo "Downloading sam2.1_hiera_base_plus.pt checkpoint..."
54 | $CMD $sam2p1_hiera_b_plus_url || { echo "Failed to download checkpoint from $sam2p1_hiera_b_plus_url"; exit 1; }
55 | 
56 | echo "Downloading sam2.1_hiera_large.pt checkpoint..."
57 | $CMD $sam2p1_hiera_l_url || { echo "Failed to download checkpoint from $sam2p1_hiera_l_url"; exit 1; }
58 | 
59 | echo "All checkpoints are downloaded successfully."
60 | 


--------------------------------------------------------------------------------
/checkpoints/edgetam.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/checkpoints/edgetam.pt


--------------------------------------------------------------------------------
/convert_weights.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 2 | # All rights reserved.
 3 | 
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | import argparse
 8 | 
 9 | import torch
10 | 
11 | 
12 | def main(args):
13 |     sd = torch.load(args.src, map_location="cpu")["model"]
14 |     sd = {k: v for k, v in sd.items() if "teacher" not in k}
15 |     sd = {
16 |         k.replace("backbone.vision_backbone", "image_encoder"): v for k, v in sd.items()
17 |     }
18 |     sd = {k.replace("mlp.fc1", "mlp.layers.0"): v for k, v in sd.items()}
19 |     sd = {k.replace("mlp.fc2", "mlp.layers.1"): v for k, v in sd.items()}
20 |     sd = {k.replace("convs", "neck.convs"): v for k, v in sd.items()}
21 |     sd = {
22 |         k.replace("transformer.encoder", "memory_attention"): v for k, v in sd.items()
23 |     }
24 |     sd = {k.replace("maskmem_backbone", "memory_encoder"): v for k, v in sd.items()}
25 |     sd = {k.replace("maskmem_backbone", "memory_encoder"): v for k, v in sd.items()}
26 |     sd = {k.replace("mlp.lin1", "mlp.layers.0"): v for k, v in sd.items()}
27 |     sd = {k.replace("mlp.lin2", "mlp.layers.1"): v for k, v in sd.items()}
28 |     torch.save({"model": sd}, args.src.replace(".pt", "_converted.pt"))
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     parser = argparse.ArgumentParser()
33 |     parser.add_argument("--src", type=str, required=True)
34 |     args = parser.parse_args()
35 | 
36 |     main(args)
37 | 


--------------------------------------------------------------------------------
/examples/01_breakdancer.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/01_breakdancer.mp4


--------------------------------------------------------------------------------
/examples/01_dog.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/01_dog.mp4


--------------------------------------------------------------------------------
/examples/02_cups.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/02_cups.mp4


--------------------------------------------------------------------------------
/examples/02_hummingbird.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/02_hummingbird.mp4


--------------------------------------------------------------------------------
/examples/03_blocks.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/03_blocks.mp4


--------------------------------------------------------------------------------
/examples/03_skateboarder.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/03_skateboarder.mp4


--------------------------------------------------------------------------------
/examples/04_coffee.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/04_coffee.mp4


--------------------------------------------------------------------------------
/examples/04_octopus.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/04_octopus.mp4


--------------------------------------------------------------------------------
/examples/05_default_juggle.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/05_default_juggle.mp4


--------------------------------------------------------------------------------
/examples/05_landing_dog_soccer.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/05_landing_dog_soccer.mp4


--------------------------------------------------------------------------------
/examples/06_pingpong.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/06_pingpong.mp4


--------------------------------------------------------------------------------
/examples/07_snowboarder.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/07_snowboarder.mp4


--------------------------------------------------------------------------------
/examples/08_driving.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/08_driving.mp4


--------------------------------------------------------------------------------
/examples/09_birdcartoon.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/09_birdcartoon.mp4


--------------------------------------------------------------------------------
/examples/10_cloth_magic.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/10_cloth_magic.mp4


--------------------------------------------------------------------------------
/examples/11_polevault.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/11_polevault.mp4


--------------------------------------------------------------------------------
/examples/12_hideandseek.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/12_hideandseek.mp4


--------------------------------------------------------------------------------
/examples/13_butterfly.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/13_butterfly.mp4


--------------------------------------------------------------------------------
/examples/14_social_dog_training.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/14_social_dog_training.mp4


--------------------------------------------------------------------------------
/examples/15_cricket.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/15_cricket.mp4


--------------------------------------------------------------------------------
/examples/16_robotarm.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/16_robotarm.mp4


--------------------------------------------------------------------------------
/examples/17_childrendancing.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/17_childrendancing.mp4


--------------------------------------------------------------------------------
/examples/18_threedogs.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/18_threedogs.mp4


--------------------------------------------------------------------------------
/examples/19_cyclist.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/19_cyclist.mp4


--------------------------------------------------------------------------------
/examples/20_doughkneading.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/20_doughkneading.mp4


--------------------------------------------------------------------------------
/examples/21_biker.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/21_biker.mp4


--------------------------------------------------------------------------------
/examples/22_dogskateboarder.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/22_dogskateboarder.mp4


--------------------------------------------------------------------------------
/examples/23_racecar.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/23_racecar.mp4


--------------------------------------------------------------------------------
/examples/24_clownfish.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/examples/24_clownfish.mp4


--------------------------------------------------------------------------------
/notebooks/images/cars.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/images/cars.jpg


--------------------------------------------------------------------------------
/notebooks/images/groceries.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/images/groceries.jpg


--------------------------------------------------------------------------------
/notebooks/images/truck.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/images/truck.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom.mp4


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00000.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00000.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00001.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00001.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00002.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00002.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00003.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00003.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00004.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00004.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00005.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00005.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00006.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00006.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00007.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00007.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00008.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00008.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00009.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00009.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00010.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00010.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00011.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00011.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00012.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00012.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00013.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00013.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00014.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00014.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00015.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00015.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00016.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00016.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00017.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00017.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00018.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00018.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00019.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00019.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00020.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00020.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00021.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00021.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00022.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00022.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00023.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00023.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00024.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00024.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00025.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00025.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00026.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00026.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00027.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00027.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00028.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00028.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00029.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00029.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00030.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00030.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00031.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00031.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00032.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00032.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00033.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00033.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00034.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00034.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00035.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00035.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00036.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00036.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00037.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00037.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00038.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00038.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00039.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00039.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00040.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00040.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00041.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00041.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00042.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00042.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00043.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00043.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00044.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00044.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00045.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00045.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00046.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00046.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00047.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00047.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00048.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00048.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00049.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00049.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00050.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00050.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00051.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00051.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00052.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00052.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00053.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00053.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00054.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00054.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00055.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00055.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00056.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00056.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00057.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00057.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00058.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00058.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00059.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00059.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00060.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00060.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00061.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00061.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00062.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00062.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00063.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00063.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00064.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00064.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00065.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00065.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00066.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00066.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00067.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00067.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00068.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00068.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00069.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00069.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00070.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00070.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00071.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00071.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00072.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00072.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00073.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00073.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00074.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00074.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00075.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00075.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00076.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00076.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00077.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00077.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00078.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00078.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00079.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00079.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00080.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00080.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00081.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00081.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00082.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00082.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00083.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00083.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00084.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00084.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00085.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00085.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00086.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00086.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00087.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00087.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00088.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00088.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00089.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00089.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00090.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00090.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00091.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00091.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00092.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00092.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00093.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00093.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00094.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00094.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00095.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00095.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00096.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00096.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00097.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00097.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00098.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00098.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00099.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00099.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00100.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00100.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00101.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00101.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00102.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00102.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00103.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00103.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00104.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00104.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00105.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00105.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00106.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00106.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00107.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00107.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00108.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00108.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00109.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00109.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00110.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00110.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00111.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00111.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00112.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00112.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00113.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00113.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00114.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00114.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00115.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00115.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00116.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00116.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00117.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00117.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00118.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00118.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00119.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00119.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00120.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00120.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00121.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00121.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00122.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00122.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00123.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00123.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00124.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00124.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00125.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00125.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00126.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00126.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00127.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00127.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00128.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00128.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00129.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00129.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00130.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00130.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00131.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00131.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00132.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00132.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00133.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00133.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00134.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00134.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00135.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00135.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00136.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00136.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00137.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00137.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00138.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00138.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00139.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00139.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00140.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00140.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00141.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00141.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00142.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00142.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00143.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00143.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00144.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00144.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00145.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00145.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00146.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00146.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00147.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00147.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00148.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00148.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00149.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00149.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00150.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00150.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00151.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00151.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00152.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00152.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00153.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00153.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00154.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00154.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00155.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00155.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00156.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00156.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00157.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00157.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00158.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00158.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00159.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00159.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00160.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00160.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00161.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00161.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00162.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00162.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00163.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00163.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00164.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00164.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00165.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00165.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00166.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00166.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00167.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00167.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00168.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00168.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00169.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00169.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00170.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00170.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00171.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00171.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00172.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00172.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00173.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00173.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00174.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00174.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00175.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00175.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00176.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00176.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00177.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00177.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00178.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00178.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00179.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00179.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00180.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00180.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00181.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00181.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00182.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00182.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00183.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00183.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00184.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00184.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00185.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00185.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00186.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00186.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00187.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00187.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00188.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00188.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00189.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00189.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00190.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00190.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00191.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00191.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00192.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00192.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00193.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00193.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00194.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00194.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00195.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00195.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00196.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00196.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00197.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00197.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00198.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00198.jpg


--------------------------------------------------------------------------------
/notebooks/videos/bedroom/00199.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/EdgeTAM/9cc44bfe9e16cc374242917b91283c718410ea87/notebooks/videos/bedroom/00199.jpg


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=61.0",  
4 |     "torch>=2.3.1",
5 |     ]
6 | build-backend = "setuptools.build_meta"
7 | 


--------------------------------------------------------------------------------
/sam2/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 2 | # All rights reserved.
 3 | 
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | from hydra import initialize_config_module
 8 | from hydra.core.global_hydra import GlobalHydra
 9 | 
10 | if not GlobalHydra.instance().is_initialized():
11 |     initialize_config_module("sam2", version_base="1.2")
12 | 


--------------------------------------------------------------------------------
/sam2/build_sam.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import logging
  8 | import os
  9 | 
 10 | import sam2
 11 | 
 12 | import torch
 13 | from hydra import compose
 14 | from hydra.utils import instantiate
 15 | from omegaconf import OmegaConf
 16 | 
 17 | # Check if the user is running Python from the parent directory of the sam2 repo
 18 | # (i.e. the directory where this repo is cloned into) -- this is not supported since
 19 | # it could shadow the sam2 package and cause issues.
 20 | if os.path.isdir(os.path.join(sam2.__path__[0], "sam2")):
 21 |     # If the user has "sam2/sam2" in their path, they are likey importing the repo itself
 22 |     # as "sam2" rather than importing the "sam2" python package (i.e. "sam2/sam2" directory).
 23 |     # This typically happens because the user is running Python from the parent directory
 24 |     # that contains the sam2 repo they cloned.
 25 |     raise RuntimeError(
 26 |         "You're likely running Python from the parent directory of the sam2 repository "
 27 |         "(i.e. the directory where https://github.com/facebookresearch/sam2 is cloned into). "
 28 |         "This is not supported since the `sam2` Python package could be shadowed by the "
 29 |         "repository name (the repository is also named `sam2` and contains the Python package "
 30 |         "in `sam2/sam2`). Please run Python from another directory (e.g. from the repo dir "
 31 |         "rather than its parent dir, or from your home directory) after installing SAM 2."
 32 |     )
 33 | 
 34 | 
 35 | HF_MODEL_ID_TO_FILENAMES = {
 36 |     "facebook/sam2-hiera-tiny": (
 37 |         "configs/sam2/sam2_hiera_t.yaml",
 38 |         "sam2_hiera_tiny.pt",
 39 |     ),
 40 |     "facebook/sam2-hiera-small": (
 41 |         "configs/sam2/sam2_hiera_s.yaml",
 42 |         "sam2_hiera_small.pt",
 43 |     ),
 44 |     "facebook/sam2-hiera-base-plus": (
 45 |         "configs/sam2/sam2_hiera_b+.yaml",
 46 |         "sam2_hiera_base_plus.pt",
 47 |     ),
 48 |     "facebook/sam2-hiera-large": (
 49 |         "configs/sam2/sam2_hiera_l.yaml",
 50 |         "sam2_hiera_large.pt",
 51 |     ),
 52 |     "facebook/sam2.1-hiera-tiny": (
 53 |         "configs/sam2.1/sam2.1_hiera_t.yaml",
 54 |         "sam2.1_hiera_tiny.pt",
 55 |     ),
 56 |     "facebook/sam2.1-hiera-small": (
 57 |         "configs/sam2.1/sam2.1_hiera_s.yaml",
 58 |         "sam2.1_hiera_small.pt",
 59 |     ),
 60 |     "facebook/sam2.1-hiera-base-plus": (
 61 |         "configs/sam2.1/sam2.1_hiera_b+.yaml",
 62 |         "sam2.1_hiera_base_plus.pt",
 63 |     ),
 64 |     "facebook/sam2.1-hiera-large": (
 65 |         "configs/sam2.1/sam2.1_hiera_l.yaml",
 66 |         "sam2.1_hiera_large.pt",
 67 |     ),
 68 | }
 69 | 
 70 | 
 71 | def build_sam2(
 72 |     config_file,
 73 |     ckpt_path=None,
 74 |     device="cuda",
 75 |     mode="eval",
 76 |     hydra_overrides_extra=[],
 77 |     apply_postprocessing=True,
 78 |     **kwargs,
 79 | ):
 80 | 
 81 |     if apply_postprocessing:
 82 |         hydra_overrides_extra = hydra_overrides_extra.copy()
 83 |         hydra_overrides_extra += [
 84 |             # dynamically fall back to multi-mask if the single mask is not stable
 85 |             "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
 86 |             "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
 87 |             "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
 88 |         ]
 89 |     # Read config and init model
 90 |     cfg = compose(config_name=config_file, overrides=hydra_overrides_extra)
 91 |     OmegaConf.resolve(cfg)
 92 |     model = instantiate(cfg.model, _recursive_=True)
 93 |     _load_checkpoint(model, ckpt_path)
 94 |     model = model.to(device)
 95 |     if mode == "eval":
 96 |         model.eval()
 97 |     return model
 98 | 
 99 | 
100 | def build_sam2_video_predictor(
101 |     config_file,
102 |     ckpt_path=None,
103 |     device="cuda",
104 |     mode="eval",
105 |     hydra_overrides_extra=[],
106 |     apply_postprocessing=True,
107 |     **kwargs,
108 | ):
109 |     hydra_overrides = [
110 |         "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictor",
111 |     ]
112 |     if apply_postprocessing:
113 |         hydra_overrides_extra = hydra_overrides_extra.copy()
114 |         hydra_overrides_extra += [
115 |             # dynamically fall back to multi-mask if the single mask is not stable
116 |             "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
117 |             "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
118 |             "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
119 |             # the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
120 |             "++model.binarize_mask_from_pts_for_mem_enc=true",
121 |             # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
122 |             "++model.fill_hole_area=8",
123 |         ]
124 |     hydra_overrides.extend(hydra_overrides_extra)
125 | 
126 |     # Read config and init model
127 |     cfg = compose(config_name=config_file, overrides=hydra_overrides)
128 |     OmegaConf.resolve(cfg)
129 |     print("configuration solved")
130 |     model = instantiate(cfg.model, _recursive_=True)
131 |     print("model instantiated")
132 |     _load_checkpoint(model, ckpt_path)
133 |     print("checkpoint loaded")
134 |     model = model.to(device)
135 |     if mode == "eval":
136 |         model.eval()
137 |     print("model ready")
138 |     return model
139 | 
140 | 
141 | def _hf_download(model_id):
142 |     from huggingface_hub import hf_hub_download
143 | 
144 |     config_name, checkpoint_name = HF_MODEL_ID_TO_FILENAMES[model_id]
145 |     ckpt_path = hf_hub_download(repo_id=model_id, filename=checkpoint_name)
146 |     return config_name, ckpt_path
147 | 
148 | 
149 | def build_sam2_hf(model_id, **kwargs):
150 |     config_name, ckpt_path = _hf_download(model_id)
151 |     return build_sam2(config_file=config_name, ckpt_path=ckpt_path, **kwargs)
152 | 
153 | 
154 | def build_sam2_video_predictor_hf(model_id, **kwargs):
155 |     config_name, ckpt_path = _hf_download(model_id)
156 |     return build_sam2_video_predictor(
157 |         config_file=config_name, ckpt_path=ckpt_path, **kwargs
158 |     )
159 | 
160 | 
161 | def _load_checkpoint(model, ckpt_path):
162 |     if ckpt_path is not None:
163 |         sd = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]
164 |         missing_keys, unexpected_keys = model.load_state_dict(sd)
165 |         if missing_keys:
166 |             logging.error(missing_keys)
167 |             raise RuntimeError()
168 |         if unexpected_keys:
169 |             logging.error(unexpected_keys)
170 |             raise RuntimeError()
171 |         logging.info("Loaded checkpoint sucessfully")
172 | 


--------------------------------------------------------------------------------
/sam2/configs/edgetam.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.timm.TimmBackbone
 11 |       name: repvit_m1.dist_in1k
 12 |       features:
 13 |       - layer0
 14 |       - layer1
 15 |       - layer2
 16 |       - layer3
 17 |     neck:
 18 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 19 |       position_encoding:
 20 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 21 |         num_pos_feats: 256
 22 |         normalize: true
 23 |         scale: null
 24 |         temperature: 10000
 25 |       d_model: 256
 26 |       backbone_channel_list: [384, 192, 96, 48]
 27 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 28 |       fpn_interp_model: nearest
 29 | 
 30 |   memory_attention:
 31 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 32 |     d_model: 256
 33 |     pos_enc_at_input: true
 34 |     layer:
 35 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 36 |       activation: relu
 37 |       dim_feedforward: 2048
 38 |       dropout: 0.1
 39 |       pos_enc_at_attn: false
 40 |       self_attention:
 41 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 42 |         rope_theta: 10000.0
 43 |         feat_sizes: [32, 32]
 44 |         embedding_dim: 256
 45 |         num_heads: 1
 46 |         downsample_rate: 1
 47 |         dropout: 0.1
 48 |       d_model: 256
 49 |       pos_enc_at_cross_attn_keys: true
 50 |       pos_enc_at_cross_attn_queries: false
 51 |       cross_attention:
 52 |         _target_: sam2.modeling.sam.transformer.RoPEAttentionv2
 53 |         rope_theta: 10000.0
 54 |         q_sizes: [64, 64]
 55 |         k_sizes: [16, 16]
 56 |         embedding_dim: 256
 57 |         num_heads: 1
 58 |         downsample_rate: 1
 59 |         dropout: 0.1
 60 |         kv_in_dim: 64
 61 |     num_layers: 2
 62 | 
 63 |   memory_encoder:
 64 |     _target_: sam2.modeling.memory_encoder.MemoryEncoder
 65 |     out_dim: 64
 66 |     position_encoding:
 67 |       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 68 |       num_pos_feats: 64
 69 |       normalize: true
 70 |       scale: null
 71 |       temperature: 10000
 72 |     mask_downsampler:
 73 |       _target_: sam2.modeling.memory_encoder.MaskDownSampler
 74 |       kernel_size: 3
 75 |       stride: 2
 76 |       padding: 1
 77 |     fuser:
 78 |       _target_: sam2.modeling.memory_encoder.Fuser
 79 |       layer:
 80 |         _target_: sam2.modeling.memory_encoder.CXBlock
 81 |         dim: 256
 82 |         kernel_size: 7
 83 |         padding: 3
 84 |         layer_scale_init_value: 1e-6
 85 |         use_dwconv: True  # depth-wise convs
 86 |       num_layers: 2
 87 | 
 88 |   spatial_perceiver:
 89 |     _target_: sam2.modeling.perceiver.PerceiverResampler
 90 |     depth: 2
 91 |     dim: 64
 92 |     dim_head: 64
 93 |     heads: 1
 94 |     ff_mult: 4
 95 |     hidden_dropout_p: 0.
 96 |     attention_dropout_p: 0.
 97 |     pos_enc_at_key_value: true # implicit pos
 98 |     concat_kv_latents: false
 99 |     num_latents: 256
100 |     num_latents_2d: 256
101 |     position_encoding:
102 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
103 |         num_pos_feats: 64
104 |         normalize: true
105 |         scale: null
106 |         temperature: 10000
107 |     use_self_attn: true
108 | 
109 |   num_maskmem: 7
110 |   image_size: 1024
111 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
112 |   sigmoid_scale_for_mem_enc: 20.0
113 |   sigmoid_bias_for_mem_enc: -10.0
114 |   use_mask_input_as_output_without_sam: true
115 |   # Memory
116 |   directly_add_no_mem_embed: true
117 |   # use high-resolution feature map in the SAM mask decoder
118 |   use_high_res_features_in_sam: true
119 |   # output 3 masks on the first click on initial conditioning frames
120 |   multimask_output_in_sam: true
121 |   # SAM heads
122 |   iou_prediction_use_sigmoid: True
123 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
124 |   use_obj_ptrs_in_encoder: true
125 |   add_tpos_enc_to_obj_ptrs: false
126 |   only_obj_ptrs_in_the_past_for_eval: true
127 |   # object occlusion prediction
128 |   pred_obj_scores: true
129 |   pred_obj_scores_mlp: true
130 |   fixed_no_obj_ptr: true
131 |   # multimask tracking settings
132 |   multimask_output_for_tracking: true
133 |   use_multimask_token_for_obj_ptr: true
134 |   multimask_min_pt_num: 0
135 |   multimask_max_pt_num: 1
136 |   use_mlp_for_obj_ptr_proj: true
137 |   # Compilation flag
138 |   compile_image_encoder: false


--------------------------------------------------------------------------------
/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.hieradet.Hiera
 11 |       embed_dim: 112
 12 |       num_heads: 2
 13 |     neck:
 14 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 15 |       position_encoding:
 16 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 17 |         num_pos_feats: 256
 18 |         normalize: true
 19 |         scale: null
 20 |         temperature: 10000
 21 |       d_model: 256
 22 |       backbone_channel_list: [896, 448, 224, 112]
 23 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 24 |       fpn_interp_model: nearest
 25 | 
 26 |   memory_attention:
 27 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 28 |     d_model: 256
 29 |     pos_enc_at_input: true
 30 |     layer:
 31 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 32 |       activation: relu
 33 |       dim_feedforward: 2048
 34 |       dropout: 0.1
 35 |       pos_enc_at_attn: false
 36 |       self_attention:
 37 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 38 |         rope_theta: 10000.0
 39 |         feat_sizes: [32, 32]
 40 |         embedding_dim: 256
 41 |         num_heads: 1
 42 |         downsample_rate: 1
 43 |         dropout: 0.1
 44 |       d_model: 256
 45 |       pos_enc_at_cross_attn_keys: true
 46 |       pos_enc_at_cross_attn_queries: false
 47 |       cross_attention:
 48 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 49 |         rope_theta: 10000.0
 50 |         feat_sizes: [32, 32]
 51 |         rope_k_repeat: True
 52 |         embedding_dim: 256
 53 |         num_heads: 1
 54 |         downsample_rate: 1
 55 |         dropout: 0.1
 56 |         kv_in_dim: 64
 57 |     num_layers: 4
 58 | 
 59 |   memory_encoder:
 60 |       _target_: sam2.modeling.memory_encoder.MemoryEncoder
 61 |       out_dim: 64
 62 |       position_encoding:
 63 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 64 |         num_pos_feats: 64
 65 |         normalize: true
 66 |         scale: null
 67 |         temperature: 10000
 68 |       mask_downsampler:
 69 |         _target_: sam2.modeling.memory_encoder.MaskDownSampler
 70 |         kernel_size: 3
 71 |         stride: 2
 72 |         padding: 1
 73 |       fuser:
 74 |         _target_: sam2.modeling.memory_encoder.Fuser
 75 |         layer:
 76 |           _target_: sam2.modeling.memory_encoder.CXBlock
 77 |           dim: 256
 78 |           kernel_size: 7
 79 |           padding: 3
 80 |           layer_scale_init_value: 1e-6
 81 |           use_dwconv: True  # depth-wise convs
 82 |         num_layers: 2
 83 | 
 84 |   num_maskmem: 7
 85 |   image_size: 1024
 86 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
 87 |   sigmoid_scale_for_mem_enc: 20.0
 88 |   sigmoid_bias_for_mem_enc: -10.0
 89 |   use_mask_input_as_output_without_sam: true
 90 |   # Memory
 91 |   directly_add_no_mem_embed: true
 92 |   no_obj_embed_spatial: true
 93 |   # use high-resolution feature map in the SAM mask decoder
 94 |   use_high_res_features_in_sam: true
 95 |   # output 3 masks on the first click on initial conditioning frames
 96 |   multimask_output_in_sam: true
 97 |   # SAM heads
 98 |   iou_prediction_use_sigmoid: True
 99 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
100 |   use_obj_ptrs_in_encoder: true
101 |   add_tpos_enc_to_obj_ptrs: true
102 |   proj_tpos_enc_in_obj_ptrs: true
103 |   use_signed_tpos_enc_to_obj_ptrs: true
104 |   only_obj_ptrs_in_the_past_for_eval: true
105 |   # object occlusion prediction
106 |   pred_obj_scores: true
107 |   pred_obj_scores_mlp: true
108 |   fixed_no_obj_ptr: true
109 |   # multimask tracking settings
110 |   multimask_output_for_tracking: true
111 |   use_multimask_token_for_obj_ptr: true
112 |   multimask_min_pt_num: 0
113 |   multimask_max_pt_num: 1
114 |   use_mlp_for_obj_ptr_proj: true
115 |   # Compilation flag
116 |   compile_image_encoder: False
117 | 


--------------------------------------------------------------------------------
/sam2/configs/sam2.1/sam2.1_hiera_l.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.hieradet.Hiera
 11 |       embed_dim: 144
 12 |       num_heads: 2
 13 |       stages: [2, 6, 36, 4]
 14 |       global_att_blocks: [23, 33, 43]
 15 |       window_pos_embed_bkg_spatial_size: [7, 7]
 16 |       window_spec: [8, 4, 16, 8]
 17 |     neck:
 18 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 19 |       position_encoding:
 20 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 21 |         num_pos_feats: 256
 22 |         normalize: true
 23 |         scale: null
 24 |         temperature: 10000
 25 |       d_model: 256
 26 |       backbone_channel_list: [1152, 576, 288, 144]
 27 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 28 |       fpn_interp_model: nearest
 29 | 
 30 |   memory_attention:
 31 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 32 |     d_model: 256
 33 |     pos_enc_at_input: true
 34 |     layer:
 35 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 36 |       activation: relu
 37 |       dim_feedforward: 2048
 38 |       dropout: 0.1
 39 |       pos_enc_at_attn: false
 40 |       self_attention:
 41 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 42 |         rope_theta: 10000.0
 43 |         feat_sizes: [32, 32]
 44 |         embedding_dim: 256
 45 |         num_heads: 1
 46 |         downsample_rate: 1
 47 |         dropout: 0.1
 48 |       d_model: 256
 49 |       pos_enc_at_cross_attn_keys: true
 50 |       pos_enc_at_cross_attn_queries: false
 51 |       cross_attention:
 52 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 53 |         rope_theta: 10000.0
 54 |         feat_sizes: [32, 32]
 55 |         rope_k_repeat: True
 56 |         embedding_dim: 256
 57 |         num_heads: 1
 58 |         downsample_rate: 1
 59 |         dropout: 0.1
 60 |         kv_in_dim: 64
 61 |     num_layers: 4
 62 | 
 63 |   memory_encoder:
 64 |       _target_: sam2.modeling.memory_encoder.MemoryEncoder
 65 |       out_dim: 64
 66 |       position_encoding:
 67 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 68 |         num_pos_feats: 64
 69 |         normalize: true
 70 |         scale: null
 71 |         temperature: 10000
 72 |       mask_downsampler:
 73 |         _target_: sam2.modeling.memory_encoder.MaskDownSampler
 74 |         kernel_size: 3
 75 |         stride: 2
 76 |         padding: 1
 77 |       fuser:
 78 |         _target_: sam2.modeling.memory_encoder.Fuser
 79 |         layer:
 80 |           _target_: sam2.modeling.memory_encoder.CXBlock
 81 |           dim: 256
 82 |           kernel_size: 7
 83 |           padding: 3
 84 |           layer_scale_init_value: 1e-6
 85 |           use_dwconv: True  # depth-wise convs
 86 |         num_layers: 2
 87 | 
 88 |   num_maskmem: 7
 89 |   image_size: 1024
 90 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
 91 |   sigmoid_scale_for_mem_enc: 20.0
 92 |   sigmoid_bias_for_mem_enc: -10.0
 93 |   use_mask_input_as_output_without_sam: true
 94 |   # Memory
 95 |   directly_add_no_mem_embed: true
 96 |   no_obj_embed_spatial: true
 97 |   # use high-resolution feature map in the SAM mask decoder
 98 |   use_high_res_features_in_sam: true
 99 |   # output 3 masks on the first click on initial conditioning frames
100 |   multimask_output_in_sam: true
101 |   # SAM heads
102 |   iou_prediction_use_sigmoid: True
103 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
104 |   use_obj_ptrs_in_encoder: true
105 |   add_tpos_enc_to_obj_ptrs: true
106 |   proj_tpos_enc_in_obj_ptrs: true
107 |   use_signed_tpos_enc_to_obj_ptrs: true
108 |   only_obj_ptrs_in_the_past_for_eval: true
109 |   # object occlusion prediction
110 |   pred_obj_scores: true
111 |   pred_obj_scores_mlp: true
112 |   fixed_no_obj_ptr: true
113 |   # multimask tracking settings
114 |   multimask_output_for_tracking: true
115 |   use_multimask_token_for_obj_ptr: true
116 |   multimask_min_pt_num: 0
117 |   multimask_max_pt_num: 1
118 |   use_mlp_for_obj_ptr_proj: true
119 |   # Compilation flag
120 |   compile_image_encoder: False
121 | 


--------------------------------------------------------------------------------
/sam2/configs/sam2.1/sam2.1_hiera_s.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.hieradet.Hiera
 11 |       embed_dim: 96
 12 |       num_heads: 1
 13 |       stages: [1, 2, 11, 2]
 14 |       global_att_blocks: [7, 10, 13]
 15 |       window_pos_embed_bkg_spatial_size: [7, 7]
 16 |     neck:
 17 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 18 |       position_encoding:
 19 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 20 |         num_pos_feats: 256
 21 |         normalize: true
 22 |         scale: null
 23 |         temperature: 10000
 24 |       d_model: 256
 25 |       backbone_channel_list: [768, 384, 192, 96]
 26 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 27 |       fpn_interp_model: nearest
 28 | 
 29 |   memory_attention:
 30 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 31 |     d_model: 256
 32 |     pos_enc_at_input: true
 33 |     layer:
 34 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 35 |       activation: relu
 36 |       dim_feedforward: 2048
 37 |       dropout: 0.1
 38 |       pos_enc_at_attn: false
 39 |       self_attention:
 40 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 41 |         rope_theta: 10000.0
 42 |         feat_sizes: [32, 32]
 43 |         embedding_dim: 256
 44 |         num_heads: 1
 45 |         downsample_rate: 1
 46 |         dropout: 0.1
 47 |       d_model: 256
 48 |       pos_enc_at_cross_attn_keys: true
 49 |       pos_enc_at_cross_attn_queries: false
 50 |       cross_attention:
 51 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 52 |         rope_theta: 10000.0
 53 |         feat_sizes: [32, 32]
 54 |         rope_k_repeat: True
 55 |         embedding_dim: 256
 56 |         num_heads: 1
 57 |         downsample_rate: 1
 58 |         dropout: 0.1
 59 |         kv_in_dim: 64
 60 |     num_layers: 4
 61 | 
 62 |   memory_encoder:
 63 |       _target_: sam2.modeling.memory_encoder.MemoryEncoder
 64 |       out_dim: 64
 65 |       position_encoding:
 66 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 67 |         num_pos_feats: 64
 68 |         normalize: true
 69 |         scale: null
 70 |         temperature: 10000
 71 |       mask_downsampler:
 72 |         _target_: sam2.modeling.memory_encoder.MaskDownSampler
 73 |         kernel_size: 3
 74 |         stride: 2
 75 |         padding: 1
 76 |       fuser:
 77 |         _target_: sam2.modeling.memory_encoder.Fuser
 78 |         layer:
 79 |           _target_: sam2.modeling.memory_encoder.CXBlock
 80 |           dim: 256
 81 |           kernel_size: 7
 82 |           padding: 3
 83 |           layer_scale_init_value: 1e-6
 84 |           use_dwconv: True  # depth-wise convs
 85 |         num_layers: 2
 86 | 
 87 |   num_maskmem: 7
 88 |   image_size: 1024
 89 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
 90 |   sigmoid_scale_for_mem_enc: 20.0
 91 |   sigmoid_bias_for_mem_enc: -10.0
 92 |   use_mask_input_as_output_without_sam: true
 93 |   # Memory
 94 |   directly_add_no_mem_embed: true
 95 |   no_obj_embed_spatial: true
 96 |   # use high-resolution feature map in the SAM mask decoder
 97 |   use_high_res_features_in_sam: true
 98 |   # output 3 masks on the first click on initial conditioning frames
 99 |   multimask_output_in_sam: true
100 |   # SAM heads
101 |   iou_prediction_use_sigmoid: True
102 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103 |   use_obj_ptrs_in_encoder: true
104 |   add_tpos_enc_to_obj_ptrs: true
105 |   proj_tpos_enc_in_obj_ptrs: true
106 |   use_signed_tpos_enc_to_obj_ptrs: true
107 |   only_obj_ptrs_in_the_past_for_eval: true
108 |   # object occlusion prediction
109 |   pred_obj_scores: true
110 |   pred_obj_scores_mlp: true
111 |   fixed_no_obj_ptr: true
112 |   # multimask tracking settings
113 |   multimask_output_for_tracking: true
114 |   use_multimask_token_for_obj_ptr: true
115 |   multimask_min_pt_num: 0
116 |   multimask_max_pt_num: 1
117 |   use_mlp_for_obj_ptr_proj: true
118 |   # Compilation flag
119 |   compile_image_encoder: False
120 | 


--------------------------------------------------------------------------------
/sam2/configs/sam2.1/sam2.1_hiera_t.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.hieradet.Hiera
 11 |       embed_dim: 96
 12 |       num_heads: 1
 13 |       stages: [1, 2, 7, 2]
 14 |       global_att_blocks: [5, 7, 9]
 15 |       window_pos_embed_bkg_spatial_size: [7, 7]
 16 |     neck:
 17 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 18 |       position_encoding:
 19 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 20 |         num_pos_feats: 256
 21 |         normalize: true
 22 |         scale: null
 23 |         temperature: 10000
 24 |       d_model: 256
 25 |       backbone_channel_list: [768, 384, 192, 96]
 26 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 27 |       fpn_interp_model: nearest
 28 | 
 29 |   memory_attention:
 30 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 31 |     d_model: 256
 32 |     pos_enc_at_input: true
 33 |     layer:
 34 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 35 |       activation: relu
 36 |       dim_feedforward: 2048
 37 |       dropout: 0.1
 38 |       pos_enc_at_attn: false
 39 |       self_attention:
 40 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 41 |         rope_theta: 10000.0
 42 |         feat_sizes: [32, 32]
 43 |         embedding_dim: 256
 44 |         num_heads: 1
 45 |         downsample_rate: 1
 46 |         dropout: 0.1
 47 |       d_model: 256
 48 |       pos_enc_at_cross_attn_keys: true
 49 |       pos_enc_at_cross_attn_queries: false
 50 |       cross_attention:
 51 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 52 |         rope_theta: 10000.0
 53 |         feat_sizes: [32, 32]
 54 |         rope_k_repeat: True
 55 |         embedding_dim: 256
 56 |         num_heads: 1
 57 |         downsample_rate: 1
 58 |         dropout: 0.1
 59 |         kv_in_dim: 64
 60 |     num_layers: 4
 61 | 
 62 |   memory_encoder:
 63 |       _target_: sam2.modeling.memory_encoder.MemoryEncoder
 64 |       out_dim: 64
 65 |       position_encoding:
 66 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 67 |         num_pos_feats: 64
 68 |         normalize: true
 69 |         scale: null
 70 |         temperature: 10000
 71 |       mask_downsampler:
 72 |         _target_: sam2.modeling.memory_encoder.MaskDownSampler
 73 |         kernel_size: 3
 74 |         stride: 2
 75 |         padding: 1
 76 |       fuser:
 77 |         _target_: sam2.modeling.memory_encoder.Fuser
 78 |         layer:
 79 |           _target_: sam2.modeling.memory_encoder.CXBlock
 80 |           dim: 256
 81 |           kernel_size: 7
 82 |           padding: 3
 83 |           layer_scale_init_value: 1e-6
 84 |           use_dwconv: True  # depth-wise convs
 85 |         num_layers: 2
 86 | 
 87 |   num_maskmem: 7
 88 |   image_size: 1024
 89 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
 90 |   # SAM decoder
 91 |   sigmoid_scale_for_mem_enc: 20.0
 92 |   sigmoid_bias_for_mem_enc: -10.0
 93 |   use_mask_input_as_output_without_sam: true
 94 |   # Memory
 95 |   directly_add_no_mem_embed: true
 96 |   no_obj_embed_spatial: true
 97 |   # use high-resolution feature map in the SAM mask decoder
 98 |   use_high_res_features_in_sam: true
 99 |   # output 3 masks on the first click on initial conditioning frames
100 |   multimask_output_in_sam: true
101 |   # SAM heads
102 |   iou_prediction_use_sigmoid: True
103 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
104 |   use_obj_ptrs_in_encoder: true
105 |   add_tpos_enc_to_obj_ptrs: true
106 |   proj_tpos_enc_in_obj_ptrs: true
107 |   use_signed_tpos_enc_to_obj_ptrs: true
108 |   only_obj_ptrs_in_the_past_for_eval: true
109 |   # object occlusion prediction
110 |   pred_obj_scores: true
111 |   pred_obj_scores_mlp: true
112 |   fixed_no_obj_ptr: true
113 |   # multimask tracking settings
114 |   multimask_output_for_tracking: true
115 |   use_multimask_token_for_obj_ptr: true
116 |   multimask_min_pt_num: 0
117 |   multimask_max_pt_num: 1
118 |   use_mlp_for_obj_ptr_proj: true
119 |   # Compilation flag
120 |   # HieraT does not currently support compilation, should always be set to False
121 |   compile_image_encoder: False
122 | 


--------------------------------------------------------------------------------
/sam2/configs/sam2/sam2_hiera_b+.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.hieradet.Hiera
 11 |       embed_dim: 112
 12 |       num_heads: 2
 13 |     neck:
 14 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 15 |       position_encoding:
 16 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 17 |         num_pos_feats: 256
 18 |         normalize: true
 19 |         scale: null
 20 |         temperature: 10000
 21 |       d_model: 256
 22 |       backbone_channel_list: [896, 448, 224, 112]
 23 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 24 |       fpn_interp_model: nearest
 25 | 
 26 |   memory_attention:
 27 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 28 |     d_model: 256
 29 |     pos_enc_at_input: true
 30 |     layer:
 31 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 32 |       activation: relu
 33 |       dim_feedforward: 2048
 34 |       dropout: 0.1
 35 |       pos_enc_at_attn: false
 36 |       self_attention:
 37 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 38 |         rope_theta: 10000.0
 39 |         feat_sizes: [32, 32]
 40 |         embedding_dim: 256
 41 |         num_heads: 1
 42 |         downsample_rate: 1
 43 |         dropout: 0.1
 44 |       d_model: 256
 45 |       pos_enc_at_cross_attn_keys: true
 46 |       pos_enc_at_cross_attn_queries: false
 47 |       cross_attention:
 48 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 49 |         rope_theta: 10000.0
 50 |         feat_sizes: [32, 32]
 51 |         rope_k_repeat: True
 52 |         embedding_dim: 256
 53 |         num_heads: 1
 54 |         downsample_rate: 1
 55 |         dropout: 0.1
 56 |         kv_in_dim: 64
 57 |     num_layers: 4
 58 | 
 59 |   memory_encoder:
 60 |       _target_: sam2.modeling.memory_encoder.MemoryEncoder
 61 |       out_dim: 64
 62 |       position_encoding:
 63 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 64 |         num_pos_feats: 64
 65 |         normalize: true
 66 |         scale: null
 67 |         temperature: 10000
 68 |       mask_downsampler:
 69 |         _target_: sam2.modeling.memory_encoder.MaskDownSampler
 70 |         kernel_size: 3
 71 |         stride: 2
 72 |         padding: 1
 73 |       fuser:
 74 |         _target_: sam2.modeling.memory_encoder.Fuser
 75 |         layer:
 76 |           _target_: sam2.modeling.memory_encoder.CXBlock
 77 |           dim: 256
 78 |           kernel_size: 7
 79 |           padding: 3
 80 |           layer_scale_init_value: 1e-6
 81 |           use_dwconv: True  # depth-wise convs
 82 |         num_layers: 2
 83 | 
 84 |   num_maskmem: 7
 85 |   image_size: 1024
 86 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
 87 |   sigmoid_scale_for_mem_enc: 20.0
 88 |   sigmoid_bias_for_mem_enc: -10.0
 89 |   use_mask_input_as_output_without_sam: true
 90 |   # Memory
 91 |   directly_add_no_mem_embed: true
 92 |   # use high-resolution feature map in the SAM mask decoder
 93 |   use_high_res_features_in_sam: true
 94 |   # output 3 masks on the first click on initial conditioning frames
 95 |   multimask_output_in_sam: true
 96 |   # SAM heads
 97 |   iou_prediction_use_sigmoid: True
 98 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
 99 |   use_obj_ptrs_in_encoder: true
100 |   add_tpos_enc_to_obj_ptrs: false
101 |   only_obj_ptrs_in_the_past_for_eval: true
102 |   # object occlusion prediction
103 |   pred_obj_scores: true
104 |   pred_obj_scores_mlp: true
105 |   fixed_no_obj_ptr: true
106 |   # multimask tracking settings
107 |   multimask_output_for_tracking: true
108 |   use_multimask_token_for_obj_ptr: true
109 |   multimask_min_pt_num: 0
110 |   multimask_max_pt_num: 1
111 |   use_mlp_for_obj_ptr_proj: true
112 |   # Compilation flag
113 |   compile_image_encoder: False
114 | 


--------------------------------------------------------------------------------
/sam2/configs/sam2/sam2_hiera_l.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.hieradet.Hiera
 11 |       embed_dim: 144
 12 |       num_heads: 2
 13 |       stages: [2, 6, 36, 4]
 14 |       global_att_blocks: [23, 33, 43]
 15 |       window_pos_embed_bkg_spatial_size: [7, 7]
 16 |       window_spec: [8, 4, 16, 8]
 17 |     neck:
 18 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 19 |       position_encoding:
 20 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 21 |         num_pos_feats: 256
 22 |         normalize: true
 23 |         scale: null
 24 |         temperature: 10000
 25 |       d_model: 256
 26 |       backbone_channel_list: [1152, 576, 288, 144]
 27 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 28 |       fpn_interp_model: nearest
 29 | 
 30 |   memory_attention:
 31 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 32 |     d_model: 256
 33 |     pos_enc_at_input: true
 34 |     layer:
 35 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 36 |       activation: relu
 37 |       dim_feedforward: 2048
 38 |       dropout: 0.1
 39 |       pos_enc_at_attn: false
 40 |       self_attention:
 41 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 42 |         rope_theta: 10000.0
 43 |         feat_sizes: [32, 32]
 44 |         embedding_dim: 256
 45 |         num_heads: 1
 46 |         downsample_rate: 1
 47 |         dropout: 0.1
 48 |       d_model: 256
 49 |       pos_enc_at_cross_attn_keys: true
 50 |       pos_enc_at_cross_attn_queries: false
 51 |       cross_attention:
 52 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 53 |         rope_theta: 10000.0
 54 |         feat_sizes: [32, 32]
 55 |         rope_k_repeat: True
 56 |         embedding_dim: 256
 57 |         num_heads: 1
 58 |         downsample_rate: 1
 59 |         dropout: 0.1
 60 |         kv_in_dim: 64
 61 |     num_layers: 4
 62 | 
 63 |   memory_encoder:
 64 |       _target_: sam2.modeling.memory_encoder.MemoryEncoder
 65 |       out_dim: 64
 66 |       position_encoding:
 67 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 68 |         num_pos_feats: 64
 69 |         normalize: true
 70 |         scale: null
 71 |         temperature: 10000
 72 |       mask_downsampler:
 73 |         _target_: sam2.modeling.memory_encoder.MaskDownSampler
 74 |         kernel_size: 3
 75 |         stride: 2
 76 |         padding: 1
 77 |       fuser:
 78 |         _target_: sam2.modeling.memory_encoder.Fuser
 79 |         layer:
 80 |           _target_: sam2.modeling.memory_encoder.CXBlock
 81 |           dim: 256
 82 |           kernel_size: 7
 83 |           padding: 3
 84 |           layer_scale_init_value: 1e-6
 85 |           use_dwconv: True  # depth-wise convs
 86 |         num_layers: 2
 87 | 
 88 |   num_maskmem: 7
 89 |   image_size: 1024
 90 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
 91 |   sigmoid_scale_for_mem_enc: 20.0
 92 |   sigmoid_bias_for_mem_enc: -10.0
 93 |   use_mask_input_as_output_without_sam: true
 94 |   # Memory
 95 |   directly_add_no_mem_embed: true
 96 |   # use high-resolution feature map in the SAM mask decoder
 97 |   use_high_res_features_in_sam: true
 98 |   # output 3 masks on the first click on initial conditioning frames
 99 |   multimask_output_in_sam: true
100 |   # SAM heads
101 |   iou_prediction_use_sigmoid: True
102 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103 |   use_obj_ptrs_in_encoder: true
104 |   add_tpos_enc_to_obj_ptrs: false
105 |   only_obj_ptrs_in_the_past_for_eval: true
106 |   # object occlusion prediction
107 |   pred_obj_scores: true
108 |   pred_obj_scores_mlp: true
109 |   fixed_no_obj_ptr: true
110 |   # multimask tracking settings
111 |   multimask_output_for_tracking: true
112 |   use_multimask_token_for_obj_ptr: true
113 |   multimask_min_pt_num: 0
114 |   multimask_max_pt_num: 1
115 |   use_mlp_for_obj_ptr_proj: true
116 |   # Compilation flag
117 |   compile_image_encoder: False
118 | 


--------------------------------------------------------------------------------
/sam2/configs/sam2/sam2_hiera_s.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.hieradet.Hiera
 11 |       embed_dim: 96
 12 |       num_heads: 1
 13 |       stages: [1, 2, 11, 2]
 14 |       global_att_blocks: [7, 10, 13]
 15 |       window_pos_embed_bkg_spatial_size: [7, 7]
 16 |     neck:
 17 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 18 |       position_encoding:
 19 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 20 |         num_pos_feats: 256
 21 |         normalize: true
 22 |         scale: null
 23 |         temperature: 10000
 24 |       d_model: 256
 25 |       backbone_channel_list: [768, 384, 192, 96]
 26 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 27 |       fpn_interp_model: nearest
 28 | 
 29 |   memory_attention:
 30 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 31 |     d_model: 256
 32 |     pos_enc_at_input: true
 33 |     layer:
 34 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 35 |       activation: relu
 36 |       dim_feedforward: 2048
 37 |       dropout: 0.1
 38 |       pos_enc_at_attn: false
 39 |       self_attention:
 40 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 41 |         rope_theta: 10000.0
 42 |         feat_sizes: [32, 32]
 43 |         embedding_dim: 256
 44 |         num_heads: 1
 45 |         downsample_rate: 1
 46 |         dropout: 0.1
 47 |       d_model: 256
 48 |       pos_enc_at_cross_attn_keys: true
 49 |       pos_enc_at_cross_attn_queries: false
 50 |       cross_attention:
 51 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 52 |         rope_theta: 10000.0
 53 |         feat_sizes: [32, 32]
 54 |         rope_k_repeat: True
 55 |         embedding_dim: 256
 56 |         num_heads: 1
 57 |         downsample_rate: 1
 58 |         dropout: 0.1
 59 |         kv_in_dim: 64
 60 |     num_layers: 4
 61 | 
 62 |   memory_encoder:
 63 |       _target_: sam2.modeling.memory_encoder.MemoryEncoder
 64 |       out_dim: 64
 65 |       position_encoding:
 66 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 67 |         num_pos_feats: 64
 68 |         normalize: true
 69 |         scale: null
 70 |         temperature: 10000
 71 |       mask_downsampler:
 72 |         _target_: sam2.modeling.memory_encoder.MaskDownSampler
 73 |         kernel_size: 3
 74 |         stride: 2
 75 |         padding: 1
 76 |       fuser:
 77 |         _target_: sam2.modeling.memory_encoder.Fuser
 78 |         layer:
 79 |           _target_: sam2.modeling.memory_encoder.CXBlock
 80 |           dim: 256
 81 |           kernel_size: 7
 82 |           padding: 3
 83 |           layer_scale_init_value: 1e-6
 84 |           use_dwconv: True  # depth-wise convs
 85 |         num_layers: 2
 86 | 
 87 |   num_maskmem: 7
 88 |   image_size: 1024
 89 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
 90 |   sigmoid_scale_for_mem_enc: 20.0
 91 |   sigmoid_bias_for_mem_enc: -10.0
 92 |   use_mask_input_as_output_without_sam: true
 93 |   # Memory
 94 |   directly_add_no_mem_embed: true
 95 |   # use high-resolution feature map in the SAM mask decoder
 96 |   use_high_res_features_in_sam: true
 97 |   # output 3 masks on the first click on initial conditioning frames
 98 |   multimask_output_in_sam: true
 99 |   # SAM heads
100 |   iou_prediction_use_sigmoid: True
101 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
102 |   use_obj_ptrs_in_encoder: true
103 |   add_tpos_enc_to_obj_ptrs: false
104 |   only_obj_ptrs_in_the_past_for_eval: true
105 |   # object occlusion prediction
106 |   pred_obj_scores: true
107 |   pred_obj_scores_mlp: true
108 |   fixed_no_obj_ptr: true
109 |   # multimask tracking settings
110 |   multimask_output_for_tracking: true
111 |   use_multimask_token_for_obj_ptr: true
112 |   multimask_min_pt_num: 0
113 |   multimask_max_pt_num: 1
114 |   use_mlp_for_obj_ptr_proj: true
115 |   # Compilation flag
116 |   compile_image_encoder: False
117 | 


--------------------------------------------------------------------------------
/sam2/configs/sam2/sam2_hiera_t.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.hieradet.Hiera
 11 |       embed_dim: 96
 12 |       num_heads: 1
 13 |       stages: [1, 2, 7, 2]
 14 |       global_att_blocks: [5, 7, 9]
 15 |       window_pos_embed_bkg_spatial_size: [7, 7]
 16 |     neck:
 17 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 18 |       position_encoding:
 19 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 20 |         num_pos_feats: 256
 21 |         normalize: true
 22 |         scale: null
 23 |         temperature: 10000
 24 |       d_model: 256
 25 |       backbone_channel_list: [768, 384, 192, 96]
 26 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 27 |       fpn_interp_model: nearest
 28 | 
 29 |   memory_attention:
 30 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 31 |     d_model: 256
 32 |     pos_enc_at_input: true
 33 |     layer:
 34 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 35 |       activation: relu
 36 |       dim_feedforward: 2048
 37 |       dropout: 0.1
 38 |       pos_enc_at_attn: false
 39 |       self_attention:
 40 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 41 |         rope_theta: 10000.0
 42 |         feat_sizes: [32, 32]
 43 |         embedding_dim: 256
 44 |         num_heads: 1
 45 |         downsample_rate: 1
 46 |         dropout: 0.1
 47 |       d_model: 256
 48 |       pos_enc_at_cross_attn_keys: true
 49 |       pos_enc_at_cross_attn_queries: false
 50 |       cross_attention:
 51 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 52 |         rope_theta: 10000.0
 53 |         feat_sizes: [32, 32]
 54 |         rope_k_repeat: True
 55 |         embedding_dim: 256
 56 |         num_heads: 1
 57 |         downsample_rate: 1
 58 |         dropout: 0.1
 59 |         kv_in_dim: 64
 60 |     num_layers: 4
 61 | 
 62 |   memory_encoder:
 63 |       _target_: sam2.modeling.memory_encoder.MemoryEncoder
 64 |       out_dim: 64
 65 |       position_encoding:
 66 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 67 |         num_pos_feats: 64
 68 |         normalize: true
 69 |         scale: null
 70 |         temperature: 10000
 71 |       mask_downsampler:
 72 |         _target_: sam2.modeling.memory_encoder.MaskDownSampler
 73 |         kernel_size: 3
 74 |         stride: 2
 75 |         padding: 1
 76 |       fuser:
 77 |         _target_: sam2.modeling.memory_encoder.Fuser
 78 |         layer:
 79 |           _target_: sam2.modeling.memory_encoder.CXBlock
 80 |           dim: 256
 81 |           kernel_size: 7
 82 |           padding: 3
 83 |           layer_scale_init_value: 1e-6
 84 |           use_dwconv: True  # depth-wise convs
 85 |         num_layers: 2
 86 | 
 87 |   num_maskmem: 7
 88 |   image_size: 1024
 89 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
 90 |   # SAM decoder
 91 |   sigmoid_scale_for_mem_enc: 20.0
 92 |   sigmoid_bias_for_mem_enc: -10.0
 93 |   use_mask_input_as_output_without_sam: true
 94 |   # Memory
 95 |   directly_add_no_mem_embed: true
 96 |   # use high-resolution feature map in the SAM mask decoder
 97 |   use_high_res_features_in_sam: true
 98 |   # output 3 masks on the first click on initial conditioning frames
 99 |   multimask_output_in_sam: true
100 |   # SAM heads
101 |   iou_prediction_use_sigmoid: True
102 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103 |   use_obj_ptrs_in_encoder: true
104 |   add_tpos_enc_to_obj_ptrs: false
105 |   only_obj_ptrs_in_the_past_for_eval: true
106 |   # object occlusion prediction
107 |   pred_obj_scores: true
108 |   pred_obj_scores_mlp: true
109 |   fixed_no_obj_ptr: true
110 |   # multimask tracking settings
111 |   multimask_output_for_tracking: true
112 |   use_multimask_token_for_obj_ptr: true
113 |   multimask_min_pt_num: 0
114 |   multimask_max_pt_num: 1
115 |   use_mlp_for_obj_ptr_proj: true
116 |   # Compilation flag
117 |   # HieraT does not currently support compilation, should always be set to False
118 |   compile_image_encoder: False
119 | 


--------------------------------------------------------------------------------
/sam2/csrc/connected_components.cu:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | // All rights reserved.
  3 | 
  4 | // This source code is licensed under the license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | 
  7 | // adapted from https://github.com/zsef123/Connected_components_PyTorch
  8 | // with license found in the LICENSE_cctorch file in the root directory.
  9 | #include <ATen/cuda/CUDAContext.h>
 10 | #include <cuda.h>
 11 | #include <cuda_runtime.h>
 12 | #include <torch/extension.h>
 13 | #include <torch/script.h>
 14 | #include <vector>
 15 | 
 16 | // 2d
 17 | #define BLOCK_ROWS 16
 18 | #define BLOCK_COLS 16
 19 | 
 20 | namespace cc2d {
 21 | 
 22 | template <typename T>
 23 | __device__ __forceinline__ unsigned char hasBit(T bitmap, unsigned char pos) {
 24 |   return (bitmap >> pos) & 1;
 25 | }
 26 | 
 27 | __device__ int32_t find(const int32_t* s_buf, int32_t n) {
 28 |   while (s_buf[n] != n)
 29 |     n = s_buf[n];
 30 |   return n;
 31 | }
 32 | 
 33 | __device__ int32_t find_n_compress(int32_t* s_buf, int32_t n) {
 34 |   const int32_t id = n;
 35 |   while (s_buf[n] != n) {
 36 |     n = s_buf[n];
 37 |     s_buf[id] = n;
 38 |   }
 39 |   return n;
 40 | }
 41 | 
 42 | __device__ void union_(int32_t* s_buf, int32_t a, int32_t b) {
 43 |   bool done;
 44 |   do {
 45 |     a = find(s_buf, a);
 46 |     b = find(s_buf, b);
 47 | 
 48 |     if (a < b) {
 49 |       int32_t old = atomicMin(s_buf + b, a);
 50 |       done = (old == b);
 51 |       b = old;
 52 |     } else if (b < a) {
 53 |       int32_t old = atomicMin(s_buf + a, b);
 54 |       done = (old == a);
 55 |       a = old;
 56 |     } else
 57 |       done = true;
 58 | 
 59 |   } while (!done);
 60 | }
 61 | 
 62 | __global__ void
 63 | init_labeling(int32_t* label, const uint32_t W, const uint32_t H) {
 64 |   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
 65 |   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
 66 |   const uint32_t idx = row * W + col;
 67 | 
 68 |   if (row < H && col < W)
 69 |     label[idx] = idx;
 70 | }
 71 | 
 72 | __global__ void
 73 | merge(uint8_t* img, int32_t* label, const uint32_t W, const uint32_t H) {
 74 |   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
 75 |   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
 76 |   const uint32_t idx = row * W + col;
 77 | 
 78 |   if (row >= H || col >= W)
 79 |     return;
 80 | 
 81 |   uint32_t P = 0;
 82 | 
 83 |   if (img[idx])
 84 |     P |= 0x777;
 85 |   if (row + 1 < H && img[idx + W])
 86 |     P |= 0x777 << 4;
 87 |   if (col + 1 < W && img[idx + 1])
 88 |     P |= 0x777 << 1;
 89 | 
 90 |   if (col == 0)
 91 |     P &= 0xEEEE;
 92 |   if (col + 1 >= W)
 93 |     P &= 0x3333;
 94 |   else if (col + 2 >= W)
 95 |     P &= 0x7777;
 96 | 
 97 |   if (row == 0)
 98 |     P &= 0xFFF0;
 99 |   if (row + 1 >= H)
100 |     P &= 0xFF;
101 | 
102 |   if (P > 0) {
103 |     // If need check about top-left pixel(if flag the first bit) and hit the
104 |     // top-left pixel
105 |     if (hasBit(P, 0) && img[idx - W - 1]) {
106 |       union_(label, idx, idx - 2 * W - 2); // top left block
107 |     }
108 | 
109 |     if ((hasBit(P, 1) && img[idx - W]) || (hasBit(P, 2) && img[idx - W + 1]))
110 |       union_(label, idx, idx - 2 * W); // top bottom block
111 | 
112 |     if (hasBit(P, 3) && img[idx + 2 - W])
113 |       union_(label, idx, idx - 2 * W + 2); // top right block
114 | 
115 |     if ((hasBit(P, 4) && img[idx - 1]) || (hasBit(P, 8) && img[idx + W - 1]))
116 |       union_(label, idx, idx - 2); // just left block
117 |   }
118 | }
119 | 
120 | __global__ void compression(int32_t* label, const int32_t W, const int32_t H) {
121 |   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
122 |   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
123 |   const uint32_t idx = row * W + col;
124 | 
125 |   if (row < H && col < W)
126 |     find_n_compress(label, idx);
127 | }
128 | 
129 | __global__ void final_labeling(
130 |     const uint8_t* img,
131 |     int32_t* label,
132 |     const int32_t W,
133 |     const int32_t H) {
134 |   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
135 |   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
136 |   const uint32_t idx = row * W + col;
137 | 
138 |   if (row >= H || col >= W)
139 |     return;
140 | 
141 |   int32_t y = label[idx] + 1;
142 | 
143 |   if (img[idx])
144 |     label[idx] = y;
145 |   else
146 |     label[idx] = 0;
147 | 
148 |   if (col + 1 < W) {
149 |     if (img[idx + 1])
150 |       label[idx + 1] = y;
151 |     else
152 |       label[idx + 1] = 0;
153 | 
154 |     if (row + 1 < H) {
155 |       if (img[idx + W + 1])
156 |         label[idx + W + 1] = y;
157 |       else
158 |         label[idx + W + 1] = 0;
159 |     }
160 |   }
161 | 
162 |   if (row + 1 < H) {
163 |     if (img[idx + W])
164 |       label[idx + W] = y;
165 |     else
166 |       label[idx + W] = 0;
167 |   }
168 | }
169 | 
170 | __global__ void init_counting(
171 |     const int32_t* label,
172 |     int32_t* count_init,
173 |     const int32_t W,
174 |     const int32_t H) {
175 |   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
176 |   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
177 |   const uint32_t idx = row * W + col;
178 | 
179 |   if (row >= H || col >= W)
180 |     return;
181 | 
182 |   int32_t y = label[idx];
183 |   if (y > 0) {
184 |     int32_t count_idx = y - 1;
185 |     atomicAdd(count_init + count_idx, 1);
186 |   }
187 | }
188 | 
189 | __global__ void final_counting(
190 |     const int32_t* label,
191 |     const int32_t* count_init,
192 |     int32_t* count_final,
193 |     const int32_t W,
194 |     const int32_t H) {
195 |   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
196 |   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
197 |   const uint32_t idx = row * W + col;
198 | 
199 |   if (row >= H || col >= W)
200 |     return;
201 | 
202 |   int32_t y = label[idx];
203 |   if (y > 0) {
204 |     int32_t count_idx = y - 1;
205 |     count_final[idx] = count_init[count_idx];
206 |   } else {
207 |     count_final[idx] = 0;
208 |   }
209 | }
210 | 
211 | } // namespace cc2d
212 | 
213 | std::vector<torch::Tensor> get_connected_componnets(
214 |     const torch::Tensor& inputs) {
215 |   AT_ASSERTM(inputs.is_cuda(), "inputs must be a CUDA tensor");
216 |   AT_ASSERTM(inputs.ndimension() == 4, "inputs must be [N, 1, H, W] shape");
217 |   AT_ASSERTM(
218 |       inputs.scalar_type() == torch::kUInt8, "inputs must be a uint8 type");
219 | 
220 |   const uint32_t N = inputs.size(0);
221 |   const uint32_t C = inputs.size(1);
222 |   const uint32_t H = inputs.size(2);
223 |   const uint32_t W = inputs.size(3);
224 | 
225 |   AT_ASSERTM(C == 1, "inputs must be [N, 1, H, W] shape");
226 |   AT_ASSERTM((H % 2) == 0, "height must be an even number");
227 |   AT_ASSERTM((W % 2) == 0, "width must be an even number");
228 | 
229 |   // label must be uint32_t
230 |   auto label_options =
231 |       torch::TensorOptions().dtype(torch::kInt32).device(inputs.device());
232 |   torch::Tensor labels = torch::zeros({N, C, H, W}, label_options);
233 |   torch::Tensor counts_init = torch::zeros({N, C, H, W}, label_options);
234 |   torch::Tensor counts_final = torch::zeros({N, C, H, W}, label_options);
235 | 
236 |   dim3 grid = dim3(
237 |       ((W + 1) / 2 + BLOCK_COLS - 1) / BLOCK_COLS,
238 |       ((H + 1) / 2 + BLOCK_ROWS - 1) / BLOCK_ROWS);
239 |   dim3 block = dim3(BLOCK_COLS, BLOCK_ROWS);
240 |   dim3 grid_count =
241 |       dim3((W + BLOCK_COLS) / BLOCK_COLS, (H + BLOCK_ROWS) / BLOCK_ROWS);
242 |   dim3 block_count = dim3(BLOCK_COLS, BLOCK_ROWS);
243 |   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
244 | 
245 |   for (int n = 0; n < N; n++) {
246 |     uint32_t offset = n * H * W;
247 | 
248 |     cc2d::init_labeling<<<grid, block, 0, stream>>>(
249 |         labels.data_ptr<int32_t>() + offset, W, H);
250 |     cc2d::merge<<<grid, block, 0, stream>>>(
251 |         inputs.data_ptr<uint8_t>() + offset,
252 |         labels.data_ptr<int32_t>() + offset,
253 |         W,
254 |         H);
255 |     cc2d::compression<<<grid, block, 0, stream>>>(
256 |         labels.data_ptr<int32_t>() + offset, W, H);
257 |     cc2d::final_labeling<<<grid, block, 0, stream>>>(
258 |         inputs.data_ptr<uint8_t>() + offset,
259 |         labels.data_ptr<int32_t>() + offset,
260 |         W,
261 |         H);
262 | 
263 |     // get the counting of each pixel
264 |     cc2d::init_counting<<<grid_count, block_count, 0, stream>>>(
265 |         labels.data_ptr<int32_t>() + offset,
266 |         counts_init.data_ptr<int32_t>() + offset,
267 |         W,
268 |         H);
269 |     cc2d::final_counting<<<grid_count, block_count, 0, stream>>>(
270 |         labels.data_ptr<int32_t>() + offset,
271 |         counts_init.data_ptr<int32_t>() + offset,
272 |         counts_final.data_ptr<int32_t>() + offset,
273 |         W,
274 |         H);
275 |   }
276 | 
277 |   // returned values are [labels, counts]
278 |   std::vector<torch::Tensor> outputs;
279 |   outputs.push_back(labels);
280 |   outputs.push_back(counts_final);
281 |   return outputs;
282 | }
283 | 
284 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
285 |   m.def(
286 |       "get_connected_componnets",
287 |       &get_connected_componnets,
288 |       "get_connected_componnets");
289 | }
290 | 


--------------------------------------------------------------------------------
/sam2/edgetam.yaml:
--------------------------------------------------------------------------------
  1 | # @package _global_
  2 | 
  3 | # Model
  4 | model:
  5 |   _target_: sam2.modeling.sam2_base.SAM2Base
  6 |   image_encoder:
  7 |     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
  8 |     scalp: 1
  9 |     trunk:
 10 |       _target_: sam2.modeling.backbones.timm.TimmBackbone
 11 |       name: repvit_m1.dist_in1k
 12 |       features:
 13 |       - layer0
 14 |       - layer1
 15 |       - layer2
 16 |       - layer3
 17 |     neck:
 18 |       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
 19 |       position_encoding:
 20 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 21 |         num_pos_feats: 256
 22 |         normalize: true
 23 |         scale: null
 24 |         temperature: 10000
 25 |       d_model: 256
 26 |       backbone_channel_list: [384, 192, 96, 48]
 27 |       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
 28 |       fpn_interp_model: nearest
 29 | 
 30 |   memory_attention:
 31 |     _target_: sam2.modeling.memory_attention.MemoryAttention
 32 |     d_model: 256
 33 |     pos_enc_at_input: true
 34 |     layer:
 35 |       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
 36 |       activation: relu
 37 |       dim_feedforward: 2048
 38 |       dropout: 0.1
 39 |       pos_enc_at_attn: false
 40 |       self_attention:
 41 |         _target_: sam2.modeling.sam.transformer.RoPEAttention
 42 |         rope_theta: 10000.0
 43 |         feat_sizes: [32, 32]
 44 |         embedding_dim: 256
 45 |         num_heads: 1
 46 |         downsample_rate: 1
 47 |         dropout: 0.1
 48 |       d_model: 256
 49 |       pos_enc_at_cross_attn_keys: true
 50 |       pos_enc_at_cross_attn_queries: false
 51 |       cross_attention:
 52 |         _target_: sam2.modeling.sam.transformer.RoPEAttentionv2
 53 |         rope_theta: 10000.0
 54 |         q_sizes: [64, 64]
 55 |         k_sizes: [16, 16]
 56 |         embedding_dim: 256
 57 |         num_heads: 1
 58 |         downsample_rate: 1
 59 |         dropout: 0.1
 60 |         kv_in_dim: 64
 61 |     num_layers: 2
 62 | 
 63 |   memory_encoder:
 64 |     _target_: sam2.modeling.memory_encoder.MemoryEncoder
 65 |     out_dim: 64
 66 |     position_encoding:
 67 |       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
 68 |       num_pos_feats: 64
 69 |       normalize: true
 70 |       scale: null
 71 |       temperature: 10000
 72 |     mask_downsampler:
 73 |       _target_: sam2.modeling.memory_encoder.MaskDownSampler
 74 |       kernel_size: 3
 75 |       stride: 2
 76 |       padding: 1
 77 |     fuser:
 78 |       _target_: sam2.modeling.memory_encoder.Fuser
 79 |       layer:
 80 |         _target_: sam2.modeling.memory_encoder.CXBlock
 81 |         dim: 256
 82 |         kernel_size: 7
 83 |         padding: 3
 84 |         layer_scale_init_value: 1e-6
 85 |         use_dwconv: True  # depth-wise convs
 86 |       num_layers: 2
 87 | 
 88 |   spatial_perceiver:
 89 |     _target_: sam2.modeling.perceiver.PerceiverResampler
 90 |     depth: 2
 91 |     dim: 64
 92 |     dim_head: 64
 93 |     heads: 1
 94 |     ff_mult: 4
 95 |     hidden_dropout_p: 0.
 96 |     attention_dropout_p: 0.
 97 |     pos_enc_at_key_value: true # implicit pos
 98 |     concat_kv_latents: false
 99 |     num_latents: 256
100 |     num_latents_2d: 256
101 |     position_encoding:
102 |         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
103 |         num_pos_feats: 64
104 |         normalize: true
105 |         scale: null
106 |         temperature: 10000
107 |     use_self_attn: true
108 | 
109 |   num_maskmem: 7
110 |   image_size: 1024
111 |   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
112 |   sigmoid_scale_for_mem_enc: 20.0
113 |   sigmoid_bias_for_mem_enc: -10.0
114 |   use_mask_input_as_output_without_sam: true
115 |   # Memory
116 |   directly_add_no_mem_embed: true
117 |   # use high-resolution feature map in the SAM mask decoder
118 |   use_high_res_features_in_sam: true
119 |   # output 3 masks on the first click on initial conditioning frames
120 |   multimask_output_in_sam: true
121 |   # SAM heads
122 |   iou_prediction_use_sigmoid: True
123 |   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
124 |   use_obj_ptrs_in_encoder: true
125 |   add_tpos_enc_to_obj_ptrs: false
126 |   only_obj_ptrs_in_the_past_for_eval: true
127 |   # object occlusion prediction
128 |   pred_obj_scores: true
129 |   pred_obj_scores_mlp: true
130 |   fixed_no_obj_ptr: true
131 |   # multimask tracking settings
132 |   multimask_output_for_tracking: true
133 |   use_multimask_token_for_obj_ptr: true
134 |   multimask_min_pt_num: 0
135 |   multimask_max_pt_num: 1
136 |   use_mlp_for_obj_ptr_proj: true
137 |   # Compilation flag
138 |   compile_image_encoder: false


--------------------------------------------------------------------------------
/sam2/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 


--------------------------------------------------------------------------------
/sam2/modeling/backbones/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 


--------------------------------------------------------------------------------
/sam2/modeling/backbones/hieradet.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import logging
  8 | from functools import partial
  9 | from typing import List, Tuple, Union
 10 | 
 11 | import torch
 12 | import torch.nn as nn
 13 | import torch.nn.functional as F
 14 | from iopath.common.file_io import g_pathmgr
 15 | 
 16 | from sam2.modeling.backbones.utils import (
 17 |     PatchEmbed,
 18 |     window_partition,
 19 |     window_unpartition,
 20 | )
 21 | 
 22 | from sam2.modeling.sam2_utils import DropPath, MLP
 23 | 
 24 | 
 25 | def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor:
 26 |     if pool is None:
 27 |         return x
 28 |     # (B, H, W, C) -> (B, C, H, W)
 29 |     x = x.permute(0, 3, 1, 2)
 30 |     x = pool(x)
 31 |     # (B, C, H', W') -> (B, H', W', C)
 32 |     x = x.permute(0, 2, 3, 1)
 33 |     if norm:
 34 |         x = norm(x)
 35 | 
 36 |     return x
 37 | 
 38 | 
 39 | class MultiScaleAttention(nn.Module):
 40 |     def __init__(
 41 |         self,
 42 |         dim: int,
 43 |         dim_out: int,
 44 |         num_heads: int,
 45 |         q_pool: nn.Module = None,
 46 |     ):
 47 |         super().__init__()
 48 | 
 49 |         self.dim = dim
 50 |         self.dim_out = dim_out
 51 |         self.num_heads = num_heads
 52 |         self.q_pool = q_pool
 53 |         self.qkv = nn.Linear(dim, dim_out * 3)
 54 |         self.proj = nn.Linear(dim_out, dim_out)
 55 | 
 56 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
 57 |         B, H, W, _ = x.shape
 58 |         # qkv with shape (B, H * W, 3, nHead, C)
 59 |         qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1)
 60 |         # q, k, v with shape (B, H * W, nheads, C)
 61 |         q, k, v = torch.unbind(qkv, 2)
 62 | 
 63 |         # Q pooling (for downsample at stage changes)
 64 |         if self.q_pool:
 65 |             q = do_pool(q.reshape(B, H, W, -1), self.q_pool)
 66 |             H, W = q.shape[1:3]  # downsampled shape
 67 |             q = q.reshape(B, H * W, self.num_heads, -1)
 68 | 
 69 |         # Torch's SDPA expects [B, nheads, H*W, C] so we transpose
 70 |         x = F.scaled_dot_product_attention(
 71 |             q.transpose(1, 2),
 72 |             k.transpose(1, 2),
 73 |             v.transpose(1, 2),
 74 |         )
 75 |         # Transpose back
 76 |         x = x.transpose(1, 2)
 77 |         x = x.reshape(B, H, W, -1)
 78 | 
 79 |         x = self.proj(x)
 80 | 
 81 |         return x
 82 | 
 83 | 
 84 | class MultiScaleBlock(nn.Module):
 85 |     def __init__(
 86 |         self,
 87 |         dim: int,
 88 |         dim_out: int,
 89 |         num_heads: int,
 90 |         mlp_ratio: float = 4.0,
 91 |         drop_path: float = 0.0,
 92 |         norm_layer: Union[nn.Module, str] = "LayerNorm",
 93 |         q_stride: Tuple[int, int] = None,
 94 |         act_layer: nn.Module = nn.GELU,
 95 |         window_size: int = 0,
 96 |     ):
 97 |         super().__init__()
 98 | 
 99 |         if isinstance(norm_layer, str):
100 |             norm_layer = partial(getattr(nn, norm_layer), eps=1e-6)
101 | 
102 |         self.dim = dim
103 |         self.dim_out = dim_out
104 |         self.norm1 = norm_layer(dim)
105 | 
106 |         self.window_size = window_size
107 | 
108 |         self.pool, self.q_stride = None, q_stride
109 |         if self.q_stride:
110 |             self.pool = nn.MaxPool2d(
111 |                 kernel_size=q_stride, stride=q_stride, ceil_mode=False
112 |             )
113 | 
114 |         self.attn = MultiScaleAttention(
115 |             dim,
116 |             dim_out,
117 |             num_heads=num_heads,
118 |             q_pool=self.pool,
119 |         )
120 |         self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
121 | 
122 |         self.norm2 = norm_layer(dim_out)
123 |         self.mlp = MLP(
124 |             dim_out,
125 |             int(dim_out * mlp_ratio),
126 |             dim_out,
127 |             num_layers=2,
128 |             activation=act_layer,
129 |         )
130 | 
131 |         if dim != dim_out:
132 |             self.proj = nn.Linear(dim, dim_out)
133 | 
134 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
135 |         shortcut = x  # B, H, W, C
136 |         x = self.norm1(x)
137 | 
138 |         # Skip connection
139 |         if self.dim != self.dim_out:
140 |             shortcut = do_pool(self.proj(x), self.pool)
141 | 
142 |         # Window partition
143 |         window_size = self.window_size
144 |         if window_size > 0:
145 |             H, W = x.shape[1], x.shape[2]
146 |             x, pad_hw = window_partition(x, window_size)
147 | 
148 |         # Window Attention + Q Pooling (if stage change)
149 |         x = self.attn(x)
150 |         if self.q_stride:
151 |             # Shapes have changed due to Q pooling
152 |             window_size = self.window_size // self.q_stride[0]
153 |             H, W = shortcut.shape[1:3]
154 | 
155 |             pad_h = (window_size - H % window_size) % window_size
156 |             pad_w = (window_size - W % window_size) % window_size
157 |             pad_hw = (H + pad_h, W + pad_w)
158 | 
159 |         # Reverse window partition
160 |         if self.window_size > 0:
161 |             x = window_unpartition(x, window_size, pad_hw, (H, W))
162 | 
163 |         x = shortcut + self.drop_path(x)
164 |         # MLP
165 |         x = x + self.drop_path(self.mlp(self.norm2(x)))
166 |         return x
167 | 
168 | 
169 | class Hiera(nn.Module):
170 |     """
171 |     Reference: https://arxiv.org/abs/2306.00989
172 |     """
173 | 
174 |     def __init__(
175 |         self,
176 |         embed_dim: int = 96,  # initial embed dim
177 |         num_heads: int = 1,  # initial number of heads
178 |         drop_path_rate: float = 0.0,  # stochastic depth
179 |         q_pool: int = 3,  # number of q_pool stages
180 |         q_stride: Tuple[int, int] = (2, 2),  # downsample stride bet. stages
181 |         stages: Tuple[int, ...] = (2, 3, 16, 3),  # blocks per stage
182 |         dim_mul: float = 2.0,  # dim_mul factor at stage shift
183 |         head_mul: float = 2.0,  # head_mul factor at stage shift
184 |         window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
185 |         # window size per stage, when not using global att.
186 |         window_spec: Tuple[int, ...] = (
187 |             8,
188 |             4,
189 |             14,
190 |             7,
191 |         ),
192 |         # global attn in these blocks
193 |         global_att_blocks: Tuple[int, ...] = (
194 |             12,
195 |             16,
196 |             20,
197 |         ),
198 |         weights_path=None,
199 |         return_interm_layers=True,  # return feats from every stage
200 |     ):
201 |         super().__init__()
202 | 
203 |         assert len(stages) == len(window_spec)
204 |         self.window_spec = window_spec
205 | 
206 |         depth = sum(stages)
207 |         self.q_stride = q_stride
208 |         self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
209 |         assert 0 <= q_pool <= len(self.stage_ends[:-1])
210 |         self.q_pool_blocks = [x + 1 for x in self.stage_ends[:-1]][:q_pool]
211 |         self.return_interm_layers = return_interm_layers
212 | 
213 |         self.patch_embed = PatchEmbed(
214 |             embed_dim=embed_dim,
215 |         )
216 |         # Which blocks have global att?
217 |         self.global_att_blocks = global_att_blocks
218 | 
219 |         # Windowed positional embedding (https://arxiv.org/abs/2311.05613)
220 |         self.window_pos_embed_bkg_spatial_size = window_pos_embed_bkg_spatial_size
221 |         self.pos_embed = nn.Parameter(
222 |             torch.zeros(1, embed_dim, *self.window_pos_embed_bkg_spatial_size)
223 |         )
224 |         self.pos_embed_window = nn.Parameter(
225 |             torch.zeros(1, embed_dim, self.window_spec[0], self.window_spec[0])
226 |         )
227 | 
228 |         dpr = [
229 |             x.item() for x in torch.linspace(0, drop_path_rate, depth)
230 |         ]  # stochastic depth decay rule
231 | 
232 |         cur_stage = 1
233 |         self.blocks = nn.ModuleList()
234 | 
235 |         for i in range(depth):
236 |             dim_out = embed_dim
237 |             # lags by a block, so first block of
238 |             # next stage uses an initial window size
239 |             # of previous stage and final window size of current stage
240 |             window_size = self.window_spec[cur_stage - 1]
241 | 
242 |             if self.global_att_blocks is not None:
243 |                 window_size = 0 if i in self.global_att_blocks else window_size
244 | 
245 |             if i - 1 in self.stage_ends:
246 |                 dim_out = int(embed_dim * dim_mul)
247 |                 num_heads = int(num_heads * head_mul)
248 |                 cur_stage += 1
249 | 
250 |             block = MultiScaleBlock(
251 |                 dim=embed_dim,
252 |                 dim_out=dim_out,
253 |                 num_heads=num_heads,
254 |                 drop_path=dpr[i],
255 |                 q_stride=self.q_stride if i in self.q_pool_blocks else None,
256 |                 window_size=window_size,
257 |             )
258 | 
259 |             embed_dim = dim_out
260 |             self.blocks.append(block)
261 | 
262 |         self.channel_list = (
263 |             [self.blocks[i].dim_out for i in self.stage_ends[::-1]]
264 |             if return_interm_layers
265 |             else [self.blocks[-1].dim_out]
266 |         )
267 | 
268 |         if weights_path is not None:
269 |             with g_pathmgr.open(weights_path, "rb") as f:
270 |                 chkpt = torch.load(f, map_location="cpu")
271 |             logging.info("loading Hiera", self.load_state_dict(chkpt, strict=False))
272 | 
273 |     def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
274 |         h, w = hw
275 |         window_embed = self.pos_embed_window
276 |         pos_embed = F.interpolate(self.pos_embed, size=(h, w), mode="bicubic")
277 |         pos_embed = pos_embed + window_embed.tile(
278 |             [x // y for x, y in zip(pos_embed.shape, window_embed.shape)]
279 |         )
280 |         pos_embed = pos_embed.permute(0, 2, 3, 1)
281 |         return pos_embed
282 | 
283 |     def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
284 |         x = self.patch_embed(x)
285 |         # x: (B, H, W, C)
286 | 
287 |         # Add pos embed
288 |         x = x + self._get_pos_embed(x.shape[1:3])
289 | 
290 |         outputs = []
291 |         for i, blk in enumerate(self.blocks):
292 |             x = blk(x)
293 |             if (i == self.stage_ends[-1]) or (
294 |                 i in self.stage_ends and self.return_interm_layers
295 |             ):
296 |                 feats = x.permute(0, 3, 1, 2)
297 |                 outputs.append(feats)
298 | 
299 |         return outputs
300 | 
301 |     def get_layer_id(self, layer_name):
302 |         # https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
303 |         num_layers = self.get_num_layers()
304 | 
305 |         if layer_name.find("rel_pos") != -1:
306 |             return num_layers + 1
307 |         elif layer_name.find("pos_embed") != -1:
308 |             return 0
309 |         elif layer_name.find("patch_embed") != -1:
310 |             return 0
311 |         elif layer_name.find("blocks") != -1:
312 |             return int(layer_name.split("blocks")[1].split(".")[1]) + 1
313 |         else:
314 |             return num_layers + 1
315 | 
316 |     def get_num_layers(self) -> int:
317 |         return len(self.blocks)
318 | 


--------------------------------------------------------------------------------
/sam2/modeling/backbones/image_encoder.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | from typing import List, Optional
  8 | 
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | 
 13 | 
 14 | class ImageEncoder(nn.Module):
 15 |     def __init__(
 16 |         self,
 17 |         trunk: nn.Module,
 18 |         neck: nn.Module,
 19 |         scalp: int = 0,
 20 |     ):
 21 |         super().__init__()
 22 |         self.trunk = trunk
 23 |         self.neck = neck
 24 |         self.scalp = scalp
 25 |         assert (
 26 |             self.trunk.channel_list == self.neck.backbone_channel_list
 27 |         ), f"Channel dims of trunk and neck do not match. Trunk: {self.trunk.channel_list}, neck: {self.neck.backbone_channel_list}"
 28 | 
 29 |     def forward(self, sample: torch.Tensor):
 30 |         # Forward through backbone
 31 |         features, pos = self.neck(self.trunk(sample))
 32 |         if self.scalp > 0:
 33 |             # Discard the lowest resolution features
 34 |             features, pos = features[: -self.scalp], pos[: -self.scalp]
 35 | 
 36 |         src = features[-1]
 37 |         output = {
 38 |             "vision_features": src,
 39 |             "vision_pos_enc": pos,
 40 |             "backbone_fpn": features,
 41 |         }
 42 |         return output
 43 | 
 44 | 
 45 | class FpnNeck(nn.Module):
 46 |     """
 47 |     A modified variant of Feature Pyramid Network (FPN) neck
 48 |     (we remove output conv and also do bicubic interpolation similar to ViT
 49 |     pos embed interpolation)
 50 |     """
 51 | 
 52 |     def __init__(
 53 |         self,
 54 |         position_encoding: nn.Module,
 55 |         d_model: int,
 56 |         backbone_channel_list: List[int],
 57 |         kernel_size: int = 1,
 58 |         stride: int = 1,
 59 |         padding: int = 0,
 60 |         fpn_interp_model: str = "bilinear",
 61 |         fuse_type: str = "sum",
 62 |         fpn_top_down_levels: Optional[List[int]] = None,
 63 |     ):
 64 |         """Initialize the neck
 65 |         :param trunk: the backbone
 66 |         :param position_encoding: the positional encoding to use
 67 |         :param d_model: the dimension of the model
 68 |         :param neck_norm: the normalization to use
 69 |         """
 70 |         super().__init__()
 71 |         self.position_encoding = position_encoding
 72 |         self.convs = nn.ModuleList()
 73 |         self.backbone_channel_list = backbone_channel_list
 74 |         self.d_model = d_model
 75 |         for dim in backbone_channel_list:
 76 |             current = nn.Sequential()
 77 |             current.add_module(
 78 |                 "conv",
 79 |                 nn.Conv2d(
 80 |                     in_channels=dim,
 81 |                     out_channels=d_model,
 82 |                     kernel_size=kernel_size,
 83 |                     stride=stride,
 84 |                     padding=padding,
 85 |                 ),
 86 |             )
 87 | 
 88 |             self.convs.append(current)
 89 |         self.fpn_interp_model = fpn_interp_model
 90 |         assert fuse_type in ["sum", "avg"]
 91 |         self.fuse_type = fuse_type
 92 | 
 93 |         # levels to have top-down features in its outputs
 94 |         # e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3
 95 |         # have top-down propagation, while outputs of level 0 and level 1 have only
 96 |         # lateral features from the same backbone level.
 97 |         if fpn_top_down_levels is None:
 98 |             # default is to have top-down features on all levels
 99 |             fpn_top_down_levels = range(len(self.convs))
100 |         self.fpn_top_down_levels = list(fpn_top_down_levels)
101 | 
102 |     def forward(self, xs: List[torch.Tensor]):
103 | 
104 |         out = [None] * len(self.convs)
105 |         pos = [None] * len(self.convs)
106 |         assert len(xs) == len(self.convs)
107 |         # fpn forward pass
108 |         # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py
109 |         prev_features = None
110 |         # forward in top-down order (from low to high resolution)
111 |         n = len(self.convs) - 1
112 |         for i in range(n, -1, -1):
113 |             x = xs[i]
114 |             lateral_features = self.convs[n - i](x)
115 |             if i in self.fpn_top_down_levels and prev_features is not None:
116 |                 top_down_features = F.interpolate(
117 |                     prev_features.to(dtype=torch.float32),
118 |                     scale_factor=2.0,
119 |                     mode=self.fpn_interp_model,
120 |                     align_corners=(
121 |                         None if self.fpn_interp_model == "nearest" else False
122 |                     ),
123 |                     antialias=False,
124 |                 )
125 |                 prev_features = lateral_features + top_down_features
126 |                 if self.fuse_type == "avg":
127 |                     prev_features /= 2
128 |             else:
129 |                 prev_features = lateral_features
130 |             x_out = prev_features
131 |             out[i] = x_out
132 |             pos[i] = self.position_encoding(x_out).to(x_out.dtype)
133 | 
134 |         return out, pos
135 | 


--------------------------------------------------------------------------------
/sam2/modeling/backbones/timm.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 2 | # All rights reserved.
 3 | 
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | """Backbones from the TIMM library."""
 8 | 
 9 | from typing import List, Tuple
10 | 
11 | import torch
12 | 
13 | from timm.models import create_model
14 | from torch import nn
15 | 
16 | 
17 | class TimmBackbone(nn.Module):
18 |     def __init__(
19 |         self,
20 |         name: str,
21 |         features: Tuple[str, ...],
22 |     ):
23 |         super().__init__()
24 | 
25 |         out_indices = tuple(int(f[len("layer") :]) for f in features)
26 | 
27 |         backbone = create_model(
28 |             name,
29 |             pretrained=True,
30 |             in_chans=3,
31 |             features_only=True,
32 |             out_indices=out_indices,
33 |         )
34 | 
35 |         num_channels = backbone.feature_info.channels()
36 |         self.channel_list = num_channels[::-1]
37 |         self.body = backbone
38 | 
39 |     def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
40 |         xs = self.body(x)
41 | 
42 |         out = []
43 |         for i, x in enumerate(xs):
44 |             out.append(x)
45 |         return out
46 | 


--------------------------------------------------------------------------------
/sam2/modeling/backbones/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 2 | # All rights reserved.
 3 | 
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | """Some utilities for backbones, in particular for windowing"""
 8 | 
 9 | from typing import Tuple
10 | 
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | 
15 | 
16 | def window_partition(x, window_size):
17 |     """
18 |     Partition into non-overlapping windows with padding if needed.
19 |     Args:
20 |         x (tensor): input tokens with [B, H, W, C].
21 |         window_size (int): window size.
22 |     Returns:
23 |         windows: windows after partition with [B * num_windows, window_size, window_size, C].
24 |         (Hp, Wp): padded height and width before partition
25 |     """
26 |     B, H, W, C = x.shape
27 | 
28 |     pad_h = (window_size - H % window_size) % window_size
29 |     pad_w = (window_size - W % window_size) % window_size
30 |     if pad_h > 0 or pad_w > 0:
31 |         x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
32 |     Hp, Wp = H + pad_h, W + pad_w
33 | 
34 |     x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
35 |     windows = (
36 |         x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
37 |     )
38 |     return windows, (Hp, Wp)
39 | 
40 | 
41 | def window_unpartition(windows, window_size, pad_hw, hw):
42 |     """
43 |     Window unpartition into original sequences and removing padding.
44 |     Args:
45 |         x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
46 |         window_size (int): window size.
47 |         pad_hw (Tuple): padded height and width (Hp, Wp).
48 |         hw (Tuple): original height and width (H, W) before padding.
49 |     Returns:
50 |         x: unpartitioned sequences with [B, H, W, C].
51 |     """
52 |     Hp, Wp = pad_hw
53 |     H, W = hw
54 |     B = windows.shape[0] // (Hp * Wp // window_size // window_size)
55 |     x = windows.view(
56 |         B, Hp // window_size, Wp // window_size, window_size, window_size, -1
57 |     )
58 |     x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
59 | 
60 |     if Hp > H or Wp > W:
61 |         x = x[:, :H, :W, :].contiguous()
62 |     return x
63 | 
64 | 
65 | class PatchEmbed(nn.Module):
66 |     """
67 |     Image to Patch Embedding.
68 |     """
69 | 
70 |     def __init__(
71 |         self,
72 |         kernel_size: Tuple[int, ...] = (7, 7),
73 |         stride: Tuple[int, ...] = (4, 4),
74 |         padding: Tuple[int, ...] = (3, 3),
75 |         in_chans: int = 3,
76 |         embed_dim: int = 768,
77 |     ):
78 |         """
79 |         Args:
80 |             kernel_size (Tuple): kernel size of the projection layer.
81 |             stride (Tuple): stride of the projection layer.
82 |             padding (Tuple): padding size of the projection layer.
83 |             in_chans (int): Number of input image channels.
84 |             embed_dim (int):  embed_dim (int): Patch embedding dimension.
85 |         """
86 |         super().__init__()
87 |         self.proj = nn.Conv2d(
88 |             in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
89 |         )
90 | 
91 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
92 |         x = self.proj(x)
93 |         # B C H W -> B H W C
94 |         x = x.permute(0, 2, 3, 1)
95 |         return x
96 | 


--------------------------------------------------------------------------------
/sam2/modeling/memory_attention.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | from typing import Optional
  8 | 
  9 | import torch
 10 | 
 11 | from sam2.modeling.sam.transformer import RoPEAttention, RoPEAttentionv2
 12 | 
 13 | from sam2.modeling.sam2_utils import get_activation_fn, get_clones
 14 | from torch import nn, Tensor
 15 | 
 16 | 
 17 | class MemoryAttentionLayer(nn.Module):
 18 | 
 19 |     def __init__(
 20 |         self,
 21 |         activation: str,
 22 |         cross_attention: nn.Module,
 23 |         d_model: int,
 24 |         dim_feedforward: int,
 25 |         dropout: float,
 26 |         pos_enc_at_attn: bool,
 27 |         pos_enc_at_cross_attn_keys: bool,
 28 |         pos_enc_at_cross_attn_queries: bool,
 29 |         self_attention: nn.Module,
 30 |     ):
 31 |         super().__init__()
 32 |         self.d_model = d_model
 33 |         self.dim_feedforward = dim_feedforward
 34 |         self.dropout_value = dropout
 35 |         self.self_attn = self_attention
 36 |         self.cross_attn_image = cross_attention
 37 | 
 38 |         # Implementation of Feedforward model
 39 |         self.linear1 = nn.Linear(d_model, dim_feedforward)
 40 |         self.dropout = nn.Dropout(dropout)
 41 |         self.linear2 = nn.Linear(dim_feedforward, d_model)
 42 | 
 43 |         self.norm1 = nn.LayerNorm(d_model)
 44 |         self.norm2 = nn.LayerNorm(d_model)
 45 |         self.norm3 = nn.LayerNorm(d_model)
 46 |         self.dropout1 = nn.Dropout(dropout)
 47 |         self.dropout2 = nn.Dropout(dropout)
 48 |         self.dropout3 = nn.Dropout(dropout)
 49 | 
 50 |         self.activation_str = activation
 51 |         self.activation = get_activation_fn(activation)
 52 | 
 53 |         # Where to add pos enc
 54 |         self.pos_enc_at_attn = pos_enc_at_attn
 55 |         self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
 56 |         self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys
 57 | 
 58 |     def _forward_sa(self, tgt, query_pos):
 59 |         # Self-Attention
 60 |         tgt2 = self.norm1(tgt)
 61 |         q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
 62 |         tgt2 = self.self_attn(q, k, v=tgt2)
 63 |         tgt = tgt + self.dropout1(tgt2)
 64 |         return tgt
 65 | 
 66 |     def _forward_ca(
 67 |         self, tgt, memory, query_pos, pos, num_k_exclude_rope=0, rope_k_repeat=-1
 68 |     ):
 69 |         kwds = {}
 70 |         if rope_k_repeat >= 0:
 71 |             assert isinstance(self.cross_attn_image, RoPEAttentionv2)
 72 |             kwds["num_k_exclude_rope"] = num_k_exclude_rope
 73 |             kwds["rope_k_repeat"] = rope_k_repeat
 74 |         elif num_k_exclude_rope > 0:
 75 |             assert isinstance(self.cross_attn_image, RoPEAttention)
 76 |             kwds = {"num_k_exclude_rope": num_k_exclude_rope}
 77 | 
 78 |         # Cross-Attention
 79 |         tgt2 = self.norm2(tgt)
 80 |         tgt2 = self.cross_attn_image(
 81 |             q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
 82 |             k=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
 83 |             v=memory,
 84 |             **kwds,
 85 |         )
 86 |         tgt = tgt + self.dropout2(tgt2)
 87 |         return tgt
 88 | 
 89 |     def forward(
 90 |         self,
 91 |         tgt,
 92 |         memory,
 93 |         pos: Optional[Tensor] = None,
 94 |         query_pos: Optional[Tensor] = None,
 95 |         num_k_exclude_rope: int = 0,
 96 |         rope_k_repeat: int = -1,
 97 |     ) -> torch.Tensor:
 98 | 
 99 |         # Self-Attn, Cross-Attn
100 |         tgt = self._forward_sa(tgt, query_pos)
101 |         tgt = self._forward_ca(
102 |             tgt, memory, query_pos, pos, num_k_exclude_rope, rope_k_repeat
103 |         )
104 |         # MLP
105 |         tgt2 = self.norm3(tgt)
106 |         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
107 |         tgt = tgt + self.dropout3(tgt2)
108 |         return tgt
109 | 
110 | 
111 | class MemoryAttention(nn.Module):
112 |     def __init__(
113 |         self,
114 |         d_model: int,
115 |         pos_enc_at_input: bool,
116 |         layer: nn.Module,
117 |         num_layers: int,
118 |         batch_first: bool = True,  # Do layers expect batch first input?
119 |     ):
120 |         super().__init__()
121 |         self.d_model = d_model
122 |         self.layers = get_clones(layer, num_layers)
123 |         self.num_layers = num_layers
124 |         self.norm = nn.LayerNorm(d_model)
125 |         self.pos_enc_at_input = pos_enc_at_input
126 |         self.batch_first = batch_first
127 | 
128 |     def forward(
129 |         self,
130 |         curr: torch.Tensor,  # self-attention inputs
131 |         memory: torch.Tensor,  # cross-attention inputs
132 |         curr_pos: Optional[Tensor] = None,  # pos_enc for self-attention inputs
133 |         memory_pos: Optional[Tensor] = None,  # pos_enc for cross-attention inputs
134 |         num_obj_ptr_tokens: int = 0,  # number of object pointer *tokens*
135 |         num_spatial_mem: int = -1,  # number of spatial memory embeddings
136 |     ):
137 |         if isinstance(curr, list):
138 |             assert isinstance(curr_pos, list)
139 |             assert len(curr) == len(curr_pos) == 1
140 |             curr, curr_pos = (
141 |                 curr[0],
142 |                 curr_pos[0],
143 |             )
144 | 
145 |         assert (
146 |             curr.shape[1] == memory.shape[1]
147 |         ), "Batch size must be the same for curr and memory"
148 | 
149 |         output = curr
150 |         if self.pos_enc_at_input and curr_pos is not None:
151 |             output = output + 0.1 * curr_pos
152 | 
153 |         if self.batch_first:
154 |             # Convert to batch first
155 |             output = output.transpose(0, 1)
156 |             curr_pos = curr_pos.transpose(0, 1)
157 |             memory = memory.transpose(0, 1)
158 |             memory_pos = memory_pos.transpose(0, 1)
159 | 
160 |         for layer in self.layers:
161 |             kwds = {}
162 |             if isinstance(layer.cross_attn_image, RoPEAttention):
163 |                 kwds = {"num_k_exclude_rope": num_obj_ptr_tokens}
164 |             if isinstance(layer.cross_attn_image, RoPEAttentionv2):
165 |                 kwds["num_k_exclude_rope"] = num_obj_ptr_tokens
166 |                 kwds["rope_k_repeat"] = num_spatial_mem
167 | 
168 |             output = layer(
169 |                 tgt=output,
170 |                 memory=memory,
171 |                 pos=memory_pos,
172 |                 query_pos=curr_pos,
173 |                 **kwds,
174 |             )
175 |         normed_output = self.norm(output)
176 | 
177 |         if self.batch_first:
178 |             # Convert back to seq first
179 |             normed_output = normed_output.transpose(0, 1)
180 |             curr_pos = curr_pos.transpose(0, 1)
181 | 
182 |         return normed_output
183 | 


--------------------------------------------------------------------------------
/sam2/modeling/memory_encoder.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import math
  8 | from typing import Tuple
  9 | 
 10 | import torch
 11 | import torch.nn as nn
 12 | import torch.nn.functional as F
 13 | 
 14 | from sam2.modeling.sam2_utils import DropPath, get_clones, LayerNorm2d
 15 | 
 16 | 
 17 | class MaskDownSampler(nn.Module):
 18 |     """
 19 |     Progressively downsample a mask by total_stride, each time by stride.
 20 |     Note that LayerNorm is applied per *token*, like in ViT.
 21 | 
 22 |     With each downsample (by a factor stride**2), channel capacity increases by the same factor.
 23 |     In the end, we linearly project to embed_dim channels.
 24 |     """
 25 | 
 26 |     def __init__(
 27 |         self,
 28 |         embed_dim=256,
 29 |         kernel_size=4,
 30 |         stride=4,
 31 |         padding=0,
 32 |         total_stride=16,
 33 |         activation=nn.GELU,
 34 |     ):
 35 |         super().__init__()
 36 |         num_layers = int(math.log2(total_stride) // math.log2(stride))
 37 |         assert stride**num_layers == total_stride
 38 |         self.encoder = nn.Sequential()
 39 |         mask_in_chans, mask_out_chans = 1, 1
 40 |         for _ in range(num_layers):
 41 |             mask_out_chans = mask_in_chans * (stride**2)
 42 |             self.encoder.append(
 43 |                 nn.Conv2d(
 44 |                     mask_in_chans,
 45 |                     mask_out_chans,
 46 |                     kernel_size=kernel_size,
 47 |                     stride=stride,
 48 |                     padding=padding,
 49 |                 )
 50 |             )
 51 |             self.encoder.append(LayerNorm2d(mask_out_chans))
 52 |             self.encoder.append(activation())
 53 |             mask_in_chans = mask_out_chans
 54 | 
 55 |         self.encoder.append(nn.Conv2d(mask_out_chans, embed_dim, kernel_size=1))
 56 | 
 57 |     def forward(self, x):
 58 |         return self.encoder(x)
 59 | 
 60 | 
 61 | # Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt)
 62 | class CXBlock(nn.Module):
 63 |     r"""ConvNeXt Block. There are two equivalent implementations:
 64 |     (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
 65 |     (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
 66 |     We use (2) as we find it slightly faster in PyTorch
 67 | 
 68 |     Args:
 69 |         dim (int): Number of input channels.
 70 |         drop_path (float): Stochastic depth rate. Default: 0.0
 71 |         layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
 72 |     """
 73 | 
 74 |     def __init__(
 75 |         self,
 76 |         dim,
 77 |         kernel_size=7,
 78 |         padding=3,
 79 |         drop_path=0.0,
 80 |         layer_scale_init_value=1e-6,
 81 |         use_dwconv=True,
 82 |     ):
 83 |         super().__init__()
 84 |         self.dwconv = nn.Conv2d(
 85 |             dim,
 86 |             dim,
 87 |             kernel_size=kernel_size,
 88 |             padding=padding,
 89 |             groups=dim if use_dwconv else 1,
 90 |         )  # depthwise conv
 91 |         self.norm = LayerNorm2d(dim, eps=1e-6)
 92 |         self.pwconv1 = nn.Linear(
 93 |             dim, 4 * dim
 94 |         )  # pointwise/1x1 convs, implemented with linear layers
 95 |         self.act = nn.GELU()
 96 |         self.pwconv2 = nn.Linear(4 * dim, dim)
 97 |         self.gamma = (
 98 |             nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
 99 |             if layer_scale_init_value > 0
100 |             else None
101 |         )
102 |         self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
103 | 
104 |     def forward(self, x):
105 |         input = x
106 |         x = self.dwconv(x)
107 |         x = self.norm(x)
108 |         x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
109 |         x = self.pwconv1(x)
110 |         x = self.act(x)
111 |         x = self.pwconv2(x)
112 |         if self.gamma is not None:
113 |             x = self.gamma * x
114 |         x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
115 | 
116 |         x = input + self.drop_path(x)
117 |         return x
118 | 
119 | 
120 | class Fuser(nn.Module):
121 |     def __init__(self, layer, num_layers, dim=None, input_projection=False):
122 |         super().__init__()
123 |         self.proj = nn.Identity()
124 |         self.layers = get_clones(layer, num_layers)
125 | 
126 |         if input_projection:
127 |             assert dim is not None
128 |             self.proj = nn.Conv2d(dim, dim, kernel_size=1)
129 | 
130 |     def forward(self, x):
131 |         # normally x: (N, C, H, W)
132 |         x = self.proj(x)
133 |         for layer in self.layers:
134 |             x = layer(x)
135 |         return x
136 | 
137 | 
138 | class MemoryEncoder(nn.Module):
139 |     def __init__(
140 |         self,
141 |         out_dim,
142 |         mask_downsampler,
143 |         fuser,
144 |         position_encoding,
145 |         in_dim=256,  # in_dim of pix_feats
146 |     ):
147 |         super().__init__()
148 | 
149 |         self.mask_downsampler = mask_downsampler
150 | 
151 |         self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1)
152 |         self.fuser = fuser
153 |         self.position_encoding = position_encoding
154 |         self.out_proj = nn.Identity()
155 |         if out_dim != in_dim:
156 |             self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1)
157 | 
158 |     def forward(
159 |         self,
160 |         pix_feat: torch.Tensor,
161 |         masks: torch.Tensor,
162 |         skip_mask_sigmoid: bool = False,
163 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
164 |         ## Process masks
165 |         # sigmoid, so that less domain shift from gt masks which are bool
166 |         if not skip_mask_sigmoid:
167 |             masks = F.sigmoid(masks)
168 |         masks = self.mask_downsampler(masks)
169 | 
170 |         ## Fuse pix_feats and downsampled masks
171 |         # in case the visual features are on CPU, cast them to CUDA
172 |         pix_feat = pix_feat.to(masks.device)
173 | 
174 |         x = self.pix_feat_proj(pix_feat)
175 |         x = x + masks
176 |         x = self.fuser(x)
177 |         x = self.out_proj(x)
178 | 
179 |         pos = self.position_encoding(x).to(x.dtype)
180 | 
181 |         return {"vision_features": x, "vision_pos_enc": [pos]}
182 | 


--------------------------------------------------------------------------------
/sam2/modeling/perceiver.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import math
  8 | 
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | 
 13 | 
 14 | def FeedForward(dim, mult=4):
 15 |     inner_dim = int(dim * mult)
 16 |     return nn.Sequential(
 17 |         nn.LayerNorm(dim),
 18 |         nn.Linear(dim, inner_dim, bias=False),
 19 |         nn.GELU(),
 20 |         nn.Linear(inner_dim, dim, bias=False),
 21 |     )
 22 | 
 23 | 
 24 | class PerceiverAttention(nn.Module):
 25 |     def __init__(
 26 |         self, *, dim, dim_head=64, heads=8, dropout_p=0.05, concat_kv_latents=True
 27 |     ):
 28 |         super().__init__()
 29 |         self.scale = dim_head**-0.5
 30 |         self.heads = heads
 31 |         inner_dim = dim_head * heads
 32 | 
 33 |         self.norm_x = nn.LayerNorm(dim)
 34 |         self.norm_latents = nn.LayerNorm(dim)
 35 | 
 36 |         self.to_q = nn.Linear(dim, inner_dim, bias=False)
 37 |         self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
 38 |         self.to_out = nn.Linear(inner_dim, dim, bias=False)
 39 | 
 40 |         self.dropout_p = dropout_p
 41 |         self.concat_kv_latents = concat_kv_latents
 42 | 
 43 |     def _separate_heads(self, x: torch.Tensor, num_heads: int) -> torch.Tensor:
 44 |         b, n, c = x.shape
 45 |         x = x.reshape(b, n, num_heads, c // num_heads)
 46 |         return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
 47 | 
 48 |     def _recombine_heads(self, x: torch.Tensor) -> torch.Tensor:
 49 |         b, n_heads, n_tokens, c_per_head = x.shape
 50 |         x = x.transpose(1, 2)
 51 |         return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C
 52 | 
 53 |     def forward(self, latents, x, pos=None):
 54 |         latents = self.norm_latents(latents)
 55 |         x = self.norm_x(x)
 56 | 
 57 |         q = self.to_q(latents)
 58 | 
 59 |         # the paper differs from Perceiver in which they also concat the key / values derived from the latents to be attended to
 60 |         if self.concat_kv_latents:
 61 |             kv_input = torch.cat((x, latents), dim=-2)
 62 |         else:
 63 |             kv_input = x
 64 |         k, v = self.to_kv(kv_input).chunk(2, dim=-1)
 65 | 
 66 |         q = self._separate_heads(q, self.heads)
 67 |         k = self._separate_heads(k, self.heads)
 68 |         v = self._separate_heads(v, self.heads)
 69 | 
 70 |         if pos is not None:
 71 |             assert not self.concat_kv_latents
 72 |             pos = self._separate_heads(pos, self.heads)
 73 |             k, v = k + pos, v + pos
 74 | 
 75 |         out = F.scaled_dot_product_attention(
 76 |             q,
 77 |             k,
 78 |             v,
 79 |             attn_mask=None,
 80 |             dropout_p=self.dropout_p if self.training else 0.0,
 81 |         )
 82 |         out = self._recombine_heads(out)
 83 |         return self.to_out(out)
 84 | 
 85 | 
 86 | class Attention(nn.Module):
 87 |     def __init__(self, *, dim, dim_head=64, heads=8, dropout_p=0.05):
 88 |         super().__init__()
 89 |         self.scale = dim_head**-0.5
 90 |         self.heads = heads
 91 |         inner_dim = dim_head * heads
 92 | 
 93 |         self.norm = nn.LayerNorm(dim)
 94 | 
 95 |         self.to_q = nn.Linear(dim, inner_dim, bias=False)
 96 |         self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
 97 |         self.to_out = nn.Linear(inner_dim, dim, bias=False)
 98 | 
 99 |         self.dropout_p = dropout_p
100 | 
101 |     def _separate_heads(self, x: torch.Tensor, num_heads: int) -> torch.Tensor:
102 |         b, n, c = x.shape
103 |         x = x.reshape(b, n, num_heads, c // num_heads)
104 |         return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
105 | 
106 |     def _recombine_heads(self, x: torch.Tensor) -> torch.Tensor:
107 |         b, n_heads, n_tokens, c_per_head = x.shape
108 |         x = x.transpose(1, 2)
109 |         return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C
110 | 
111 |     def forward(self, x):
112 |         x = self.norm(x)
113 | 
114 |         q = self.to_q(x)
115 |         k, v = self.to_kv(x).chunk(2, dim=-1)
116 | 
117 |         q = self._separate_heads(q, self.heads)
118 |         k = self._separate_heads(k, self.heads)
119 |         v = self._separate_heads(v, self.heads)
120 | 
121 |         out = F.scaled_dot_product_attention(
122 |             q,
123 |             k,
124 |             v,
125 |             attn_mask=None,
126 |             dropout_p=self.dropout_p if self.training else 0.0,
127 |         )
128 |         out = self._recombine_heads(out)
129 |         return self.to_out(out)
130 | 
131 | 
132 | class PerceiverEncoderLayer(nn.Module):
133 |     def __init__(
134 |         self,
135 |         dim,
136 |         dim_head=64,
137 |         heads=8,
138 |         ff_mult=4,
139 |         hidden_dropout_p=0.0,
140 |         attention_dropout_p=0.0,
141 |         concat_kv_latents=False,
142 |         use_self_attn=False,
143 |     ):
144 |         super().__init__()
145 |         self.attn = PerceiverAttention(
146 |             dim=dim,
147 |             dim_head=dim_head,
148 |             heads=heads,
149 |             dropout_p=attention_dropout_p,
150 |             concat_kv_latents=concat_kv_latents,
151 |         )
152 |         self.ff = FeedForward(dim=dim, mult=ff_mult)
153 |         self.dropout = nn.Dropout(hidden_dropout_p)
154 |         self.use_self_attn = use_self_attn
155 |         if use_self_attn:
156 |             self.self_attn = Attention(
157 |                 dim=dim,
158 |                 dim_head=dim_head,
159 |                 heads=heads,
160 |                 dropout_p=attention_dropout_p,
161 |             )
162 |             self.self_ff = FeedForward(dim=dim, mult=ff_mult)
163 | 
164 |     def forward(self, latents, x, pos=None):
165 |         latents = self.attn(latents, x, pos) + latents
166 |         latents = self.dropout(latents)
167 |         latents = self.ff(latents) + latents
168 |         if self.use_self_attn:
169 |             latents = self.self_attn(latents) + latents
170 |             latents = self.self_ff(latents) + latents
171 |         return latents
172 | 
173 | 
174 | def window_partition(x, window_size):
175 |     """
176 |     Args:
177 |         x: (B, H, W, C)
178 |         window_size (int): window size
179 | 
180 |     Returns:
181 |         windows: (num_windows*B, window_size, window_size, C)
182 |     """
183 |     B, H, W, C = x.shape
184 |     x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
185 |     windows = (
186 |         x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
187 |     )
188 |     return windows
189 | 
190 | 
191 | def window_reverse(windows, window_size, H, W):
192 |     """
193 |     Args:
194 |         windows: (num_windows*B, window_size, window_size, C)
195 |         window_size (int): Window size
196 |         H (int): Height of image
197 |         W (int): Width of image
198 | 
199 |     Returns:
200 |         x: (B, H, W, C)
201 |     """
202 |     B = int(windows.shape[0] / (H * W / window_size / window_size))
203 |     x = windows.view(
204 |         B, H // window_size, W // window_size, window_size, window_size, -1
205 |     )
206 |     x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
207 |     return x
208 | 
209 | 
210 | class PerceiverResampler(nn.Module):
211 |     def __init__(
212 |         self,
213 |         *,
214 |         dim,
215 |         depth,
216 |         dim_head=64,
217 |         heads=1,
218 |         num_latents=-1,
219 |         num_latents_2d=-1,
220 |         ff_mult=4,
221 |         hidden_dropout_p=0.1,
222 |         attention_dropout_p=0.05,
223 |         pos_enc_at_key_value=False,
224 |         concat_kv_latents=False,
225 |         position_encoding=None,
226 |         use_self_attn=False,
227 |         **kwargs,
228 |     ):
229 |         super().__init__()
230 |         self.num_latents = num_latents
231 |         self.num_latents_2d = num_latents_2d
232 | 
233 |         if num_latents > 0:
234 |             self.latents = nn.Parameter(torch.randn(num_latents, dim))
235 |         if num_latents_2d > 0:
236 |             self.latents_2d = nn.Parameter(torch.randn(num_latents_2d, dim))
237 |         self.position_encoding = position_encoding
238 | 
239 |         self.layers = nn.ModuleList([])
240 |         for _ in range(depth):
241 |             self.layers.append(
242 |                 PerceiverEncoderLayer(
243 |                     dim=dim,
244 |                     dim_head=dim_head,
245 |                     heads=heads,
246 |                     ff_mult=ff_mult,
247 |                     hidden_dropout_p=hidden_dropout_p,
248 |                     attention_dropout_p=attention_dropout_p,
249 |                     concat_kv_latents=concat_kv_latents,
250 |                     use_self_attn=use_self_attn,
251 |                 ),
252 |             )
253 | 
254 |         self.norm = nn.LayerNorm(dim)
255 |         self.pos_enc_at_key_value = pos_enc_at_key_value
256 | 
257 |     def forward(self, x, pos=None):
258 |         out_latents = []
259 |         out_pos = []
260 |         if self.num_latents > 0:
261 |             latents_1d, pos_1d = self.forward_1d(x, pos)
262 |             out_latents.append(latents_1d)
263 |             out_pos.append(pos_1d)
264 |         if self.num_latents_2d > 0:
265 |             latents_2d, pos_2d = self.forward_2d(x)
266 |             out_latents.append(latents_2d)
267 |             out_pos.append(pos_2d)
268 | 
269 |         latents = torch.concat(out_latents, dim=1)
270 |         if pos is not None:
271 |             pos = torch.concat(out_pos, dim=1)
272 | 
273 |         return latents, pos
274 | 
275 |     def forward_1d(self, x, pos):
276 |         latents = self.latents.unsqueeze(0).expand(x.shape[0], -1, -1)
277 |         x = x.permute(0, 2, 3, 1).flatten(1, 2)
278 | 
279 |         if not self.pos_enc_at_key_value:
280 |             _pos = None
281 |         if pos is not None:
282 |             _pos = pos.permute(0, 2, 3, 1).flatten(1, 2)
283 |         else:
284 |             _pos = None
285 | 
286 |         for layer in self.layers:
287 |             latents = layer(latents, x, _pos)
288 | 
289 |         if pos is not None:
290 |             pos = torch.zeros_like(latents)
291 | 
292 |         latents = self.norm(latents)
293 |         return latents, pos
294 | 
295 |     def forward_2d(self, x):
296 |         B, C, H, W = x.shape
297 | 
298 |         latents_2d = self.latents_2d.unsqueeze(0).expand(B, -1, -1).view(-1, 1, C)
299 | 
300 |         num_window = int(math.sqrt(self.num_latents_2d))
301 |         window_size = H // num_window
302 |         x = x.permute(0, 2, 3, 1)
303 | 
304 |         x = window_partition(x, window_size)
305 |         x = x.flatten(1, 2)
306 | 
307 |         for layer in self.layers:
308 |             latents_2d = layer(latents_2d, x)
309 | 
310 |         latents_2d = latents_2d.view(B, num_window, num_window, C).permute(0, 3, 1, 2)
311 | 
312 |         pos_2d = self.position_encoding(latents_2d)
313 |         pos_2d = pos_2d.permute(0, 2, 3, 1).flatten(1, 2)
314 | 
315 |         latents_2d = latents_2d.permute(0, 2, 3, 1).flatten(1, 2)
316 | 
317 |         latents_2d = self.norm(latents_2d)
318 | 
319 |         return latents_2d, pos_2d
320 | 


--------------------------------------------------------------------------------
/sam2/modeling/sam/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 


--------------------------------------------------------------------------------
/sam2/modeling/sam/prompt_encoder.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | from typing import Optional, Tuple, Type
  8 | 
  9 | import torch
 10 | from torch import nn
 11 | 
 12 | from sam2.modeling.position_encoding import PositionEmbeddingRandom
 13 | 
 14 | from sam2.modeling.sam2_utils import LayerNorm2d
 15 | 
 16 | 
 17 | class PromptEncoder(nn.Module):
 18 |     def __init__(
 19 |         self,
 20 |         embed_dim: int,
 21 |         image_embedding_size: Tuple[int, int],
 22 |         input_image_size: Tuple[int, int],
 23 |         mask_in_chans: int,
 24 |         activation: Type[nn.Module] = nn.GELU,
 25 |     ) -> None:
 26 |         """
 27 |         Encodes prompts for input to SAM's mask decoder.
 28 | 
 29 |         Arguments:
 30 |           embed_dim (int): The prompts' embedding dimension
 31 |           image_embedding_size (tuple(int, int)): The spatial size of the
 32 |             image embedding, as (H, W).
 33 |           input_image_size (int): The padded size of the image as input
 34 |             to the image encoder, as (H, W).
 35 |           mask_in_chans (int): The number of hidden channels used for
 36 |             encoding input masks.
 37 |           activation (nn.Module): The activation to use when encoding
 38 |             input masks.
 39 |         """
 40 |         super().__init__()
 41 |         self.embed_dim = embed_dim
 42 |         self.input_image_size = input_image_size
 43 |         self.image_embedding_size = image_embedding_size
 44 |         self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
 45 | 
 46 |         self.num_point_embeddings: int = 4  # pos/neg point + 2 box corners
 47 |         point_embeddings = [
 48 |             nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)
 49 |         ]
 50 |         self.point_embeddings = nn.ModuleList(point_embeddings)
 51 |         self.not_a_point_embed = nn.Embedding(1, embed_dim)
 52 | 
 53 |         self.mask_input_size = (
 54 |             4 * image_embedding_size[0],
 55 |             4 * image_embedding_size[1],
 56 |         )
 57 |         self.mask_downscaling = nn.Sequential(
 58 |             nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
 59 |             LayerNorm2d(mask_in_chans // 4),
 60 |             activation(),
 61 |             nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
 62 |             LayerNorm2d(mask_in_chans),
 63 |             activation(),
 64 |             nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
 65 |         )
 66 |         self.no_mask_embed = nn.Embedding(1, embed_dim)
 67 | 
 68 |     def get_dense_pe(self) -> torch.Tensor:
 69 |         """
 70 |         Returns the positional encoding used to encode point prompts,
 71 |         applied to a dense set of points the shape of the image encoding.
 72 | 
 73 |         Returns:
 74 |           torch.Tensor: Positional encoding with shape
 75 |             1x(embed_dim)x(embedding_h)x(embedding_w)
 76 |         """
 77 |         return self.pe_layer(self.image_embedding_size).unsqueeze(0)
 78 | 
 79 |     def _embed_points(
 80 |         self,
 81 |         points: torch.Tensor,
 82 |         labels: torch.Tensor,
 83 |         pad: bool,
 84 |     ) -> torch.Tensor:
 85 |         """Embeds point prompts."""
 86 |         points = points + 0.5  # Shift to center of pixel
 87 |         if pad:
 88 |             padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
 89 |             padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
 90 |             points = torch.cat([points, padding_point], dim=1)
 91 |             labels = torch.cat([labels, padding_label], dim=1)
 92 |         point_embedding = self.pe_layer.forward_with_coords(
 93 |             points, self.input_image_size
 94 |         )
 95 |         point_embedding[labels == -1] = 0.0
 96 |         point_embedding[labels == -1] += self.not_a_point_embed.weight
 97 |         point_embedding[labels == 0] += self.point_embeddings[0].weight
 98 |         point_embedding[labels == 1] += self.point_embeddings[1].weight
 99 |         point_embedding[labels == 2] += self.point_embeddings[2].weight
100 |         point_embedding[labels == 3] += self.point_embeddings[3].weight
101 |         return point_embedding
102 | 
103 |     def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
104 |         """Embeds box prompts."""
105 |         boxes = boxes + 0.5  # Shift to center of pixel
106 |         coords = boxes.reshape(-1, 2, 2)
107 |         corner_embedding = self.pe_layer.forward_with_coords(
108 |             coords, self.input_image_size
109 |         )
110 |         corner_embedding[:, 0, :] += self.point_embeddings[2].weight
111 |         corner_embedding[:, 1, :] += self.point_embeddings[3].weight
112 |         return corner_embedding
113 | 
114 |     def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
115 |         """Embeds mask inputs."""
116 |         mask_embedding = self.mask_downscaling(masks)
117 |         return mask_embedding
118 | 
119 |     def _get_batch_size(
120 |         self,
121 |         points: Optional[Tuple[torch.Tensor, torch.Tensor]],
122 |         boxes: Optional[torch.Tensor],
123 |         masks: Optional[torch.Tensor],
124 |     ) -> int:
125 |         """
126 |         Gets the batch size of the output given the batch size of the input prompts.
127 |         """
128 |         if points is not None:
129 |             return points[0].shape[0]
130 |         elif boxes is not None:
131 |             return boxes.shape[0]
132 |         elif masks is not None:
133 |             return masks.shape[0]
134 |         else:
135 |             return 1
136 | 
137 |     def _get_device(self) -> torch.device:
138 |         return self.point_embeddings[0].weight.device
139 | 
140 |     def forward(
141 |         self,
142 |         points: Optional[Tuple[torch.Tensor, torch.Tensor]],
143 |         boxes: Optional[torch.Tensor],
144 |         masks: Optional[torch.Tensor],
145 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
146 |         """
147 |         Embeds different types of prompts, returning both sparse and dense
148 |         embeddings.
149 | 
150 |         Arguments:
151 |           points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
152 |             and labels to embed.
153 |           boxes (torch.Tensor or none): boxes to embed
154 |           masks (torch.Tensor or none): masks to embed
155 | 
156 |         Returns:
157 |           torch.Tensor: sparse embeddings for the points and boxes, with shape
158 |             BxNx(embed_dim), where N is determined by the number of input points
159 |             and boxes.
160 |           torch.Tensor: dense embeddings for the masks, in the shape
161 |             Bx(embed_dim)x(embed_H)x(embed_W)
162 |         """
163 |         bs = self._get_batch_size(points, boxes, masks)
164 |         sparse_embeddings = torch.empty(
165 |             (bs, 0, self.embed_dim), device=self._get_device()
166 |         )
167 |         if points is not None:
168 |             coords, labels = points
169 |             point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
170 |             sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
171 |         if boxes is not None:
172 |             box_embeddings = self._embed_boxes(boxes)
173 |             sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)
174 | 
175 |         if masks is not None:
176 |             dense_embeddings = self._embed_masks(masks)
177 |         else:
178 |             dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
179 |                 bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
180 |             )
181 | 
182 |         return sparse_embeddings, dense_embeddings
183 | 


--------------------------------------------------------------------------------
/sam2/sam2_hiera_b+.yaml:
--------------------------------------------------------------------------------
1 | configs/sam2/sam2_hiera_b+.yaml


--------------------------------------------------------------------------------
/sam2/sam2_hiera_l.yaml:
--------------------------------------------------------------------------------
1 | configs/sam2/sam2_hiera_l.yaml


--------------------------------------------------------------------------------
/sam2/sam2_hiera_s.yaml:
--------------------------------------------------------------------------------
1 | configs/sam2/sam2_hiera_s.yaml


--------------------------------------------------------------------------------
/sam2/sam2_hiera_t.yaml:
--------------------------------------------------------------------------------
1 | configs/sam2/sam2_hiera_t.yaml


--------------------------------------------------------------------------------
/sam2/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 


--------------------------------------------------------------------------------
/sam2/utils/transforms.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import warnings
  8 | 
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | from torchvision.transforms import Normalize, Resize, ToTensor
 13 | 
 14 | 
 15 | class SAM2Transforms(nn.Module):
 16 |     def __init__(
 17 |         self, resolution, mask_threshold, max_hole_area=0.0, max_sprinkle_area=0.0
 18 |     ):
 19 |         """
 20 |         Transforms for SAM2.
 21 |         """
 22 |         super().__init__()
 23 |         self.resolution = resolution
 24 |         self.mask_threshold = mask_threshold
 25 |         self.max_hole_area = max_hole_area
 26 |         self.max_sprinkle_area = max_sprinkle_area
 27 |         self.mean = [0.485, 0.456, 0.406]
 28 |         self.std = [0.229, 0.224, 0.225]
 29 |         self.to_tensor = ToTensor()
 30 |         self.transforms = torch.jit.script(
 31 |             nn.Sequential(
 32 |                 Resize((self.resolution, self.resolution)),
 33 |                 Normalize(self.mean, self.std),
 34 |             )
 35 |         )
 36 | 
 37 |     def __call__(self, x):
 38 |         x = self.to_tensor(x)
 39 |         return self.transforms(x)
 40 | 
 41 |     def forward_batch(self, img_list):
 42 |         img_batch = [self.transforms(self.to_tensor(img)) for img in img_list]
 43 |         img_batch = torch.stack(img_batch, dim=0)
 44 |         return img_batch
 45 | 
 46 |     def transform_coords(
 47 |         self, coords: torch.Tensor, normalize=False, orig_hw=None
 48 |     ) -> torch.Tensor:
 49 |         """
 50 |         Expects a torch tensor with length 2 in the last dimension. The coordinates can be in absolute image or normalized coordinates,
 51 |         If the coords are in absolute image coordinates, normalize should be set to True and original image size is required.
 52 | 
 53 |         Returns
 54 |             Un-normalized coordinates in the range of [0, 1] which is expected by the SAM2 model.
 55 |         """
 56 |         if normalize:
 57 |             assert orig_hw is not None
 58 |             h, w = orig_hw
 59 |             coords = coords.clone()
 60 |             coords[..., 0] = coords[..., 0] / w
 61 |             coords[..., 1] = coords[..., 1] / h
 62 | 
 63 |         coords = coords * self.resolution  # unnormalize coords
 64 |         return coords
 65 | 
 66 |     def transform_boxes(
 67 |         self, boxes: torch.Tensor, normalize=False, orig_hw=None
 68 |     ) -> torch.Tensor:
 69 |         """
 70 |         Expects a tensor of shape Bx4. The coordinates can be in absolute image or normalized coordinates,
 71 |         if the coords are in absolute image coordinates, normalize should be set to True and original image size is required.
 72 |         """
 73 |         boxes = self.transform_coords(boxes.reshape(-1, 2, 2), normalize, orig_hw)
 74 |         return boxes
 75 | 
 76 |     def postprocess_masks(self, masks: torch.Tensor, orig_hw) -> torch.Tensor:
 77 |         """
 78 |         Perform PostProcessing on output masks.
 79 |         """
 80 |         from sam2.utils.misc import get_connected_components
 81 | 
 82 |         masks = masks.float()
 83 |         input_masks = masks
 84 |         mask_flat = masks.flatten(0, 1).unsqueeze(1)  # flatten as 1-channel image
 85 |         try:
 86 |             if self.max_hole_area > 0:
 87 |                 # Holes are those connected components in background with area <= self.fill_hole_area
 88 |                 # (background regions are those with mask scores <= self.mask_threshold)
 89 |                 labels, areas = get_connected_components(
 90 |                     mask_flat <= self.mask_threshold
 91 |                 )
 92 |                 is_hole = (labels > 0) & (areas <= self.max_hole_area)
 93 |                 is_hole = is_hole.reshape_as(masks)
 94 |                 # We fill holes with a small positive mask score (10.0) to change them to foreground.
 95 |                 masks = torch.where(is_hole, self.mask_threshold + 10.0, masks)
 96 | 
 97 |             if self.max_sprinkle_area > 0:
 98 |                 labels, areas = get_connected_components(
 99 |                     mask_flat > self.mask_threshold
100 |                 )
101 |                 is_hole = (labels > 0) & (areas <= self.max_sprinkle_area)
102 |                 is_hole = is_hole.reshape_as(masks)
103 |                 # We fill holes with negative mask score (-10.0) to change them to background.
104 |                 masks = torch.where(is_hole, self.mask_threshold - 10.0, masks)
105 |         except Exception as e:
106 |             # Skip the post-processing step if the CUDA kernel fails
107 |             warnings.warn(
108 |                 f"{e}\n\nSkipping the post-processing step due to the error above. You can "
109 |                 "still use SAM 2 and it's OK to ignore the error above, although some post-processing "
110 |                 "functionality may be limited (which doesn't affect the results in most cases; see "
111 |                 "https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).",
112 |                 category=UserWarning,
113 |                 stacklevel=2,
114 |             )
115 |             masks = input_masks
116 | 
117 |         masks = F.interpolate(masks, orig_hw, mode="bilinear", align_corners=False)
118 |         return masks
119 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | import os
  7 | 
  8 | from setuptools import find_packages, setup
  9 | 
 10 | # Package metadata
 11 | NAME = "EdgeTAM"
 12 | VERSION = "1.0"
 13 | DESCRIPTION = "EdgeTAM: On-Device Track Anything Model"
 14 | URL = "https://github.com/facebookresearch/EdgeTAM"
 15 | AUTHOR = "Meta AI"
 16 | AUTHOR_EMAIL = "chongzhou1024@gmail.com"
 17 | LICENSE = "Apache 2.0"
 18 | 
 19 | # Read the contents of README file
 20 | with open("README.md", "r", encoding="utf-8") as f:
 21 |     LONG_DESCRIPTION = f.read()
 22 | 
 23 | # Required dependencies
 24 | REQUIRED_PACKAGES = [
 25 |     "torch>=2.3.1",
 26 |     "torchvision>=0.18.1",
 27 |     "numpy>=1.24.4",
 28 |     "tqdm>=4.66.1",
 29 |     "hydra-core>=1.3.2",
 30 |     "iopath>=0.1.10",
 31 |     "pillow>=9.4.0",
 32 | ]
 33 | 
 34 | EXTRA_PACKAGES = {
 35 |     "notebooks": [
 36 |         "matplotlib>=3.9.1",
 37 |         "jupyter>=1.0.0",
 38 |         "opencv-python>=4.7.0",
 39 |         "eva-decord>=0.6.1",
 40 |     ],
 41 |     "dev": [
 42 |         "black==24.2.0",
 43 |         "usort==1.0.2",
 44 |         "ufmt==2.0.0b2",
 45 |         "fvcore>=0.1.5.post20221221",
 46 |         "pandas>=2.2.2",
 47 |         "scikit-image>=0.24.0",
 48 |         "tensorboard>=2.17.0",
 49 |         "pycocotools>=2.0.8",
 50 |         "tensordict>=0.5.0",
 51 |         "opencv-python>=4.7.0",
 52 |         "submitit>=1.5.1",
 53 |     ],
 54 |     "gradio": [
 55 |         "gradio==4.44.0",
 56 |         "gradio_client==1.3.0",
 57 |         "gradio_image_prompter==0.1.0",
 58 |         "opencv-python==4.10.0.84",
 59 |         "moviepy==1.0.3",
 60 |         "pydantic==2.10.6",
 61 |         "timm==1.0.15",
 62 |         "eva-decord==0.6.1",
 63 |     ],
 64 | }
 65 | 
 66 | # By default, we also build the SAM 2 CUDA extension.
 67 | # You may turn off CUDA build with `export SAM2_BUILD_CUDA=0`.
 68 | BUILD_CUDA = os.getenv("SAM2_BUILD_CUDA", "1") == "1"
 69 | # By default, we allow SAM 2 installation to proceed even with build errors.
 70 | # You may force stopping on errors with `export SAM2_BUILD_ALLOW_ERRORS=0`.
 71 | BUILD_ALLOW_ERRORS = os.getenv("SAM2_BUILD_ALLOW_ERRORS", "1") == "1"
 72 | 
 73 | # Catch and skip errors during extension building and print a warning message
 74 | # (note that this message only shows up under verbose build mode
 75 | # "pip install -v -e ." or "python setup.py build_ext -v")
 76 | CUDA_ERROR_MSG = (
 77 |     "{}\n\n"
 78 |     "Failed to build the SAM 2 CUDA extension due to the error above. "
 79 |     "You can still use SAM 2 and it's OK to ignore the error above, although some "
 80 |     "post-processing functionality may be limited (which doesn't affect the results in most cases; "
 81 |     "(see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).\n"
 82 | )
 83 | 
 84 | 
 85 | def get_extensions():
 86 |     if not BUILD_CUDA:
 87 |         return []
 88 | 
 89 |     try:
 90 |         from torch.utils.cpp_extension import CUDAExtension
 91 | 
 92 |         srcs = ["sam2/csrc/connected_components.cu"]
 93 |         compile_args = {
 94 |             "cxx": [],
 95 |             "nvcc": [
 96 |                 "-DCUDA_HAS_FP16=1",
 97 |                 "-D__CUDA_NO_HALF_OPERATORS__",
 98 |                 "-D__CUDA_NO_HALF_CONVERSIONS__",
 99 |                 "-D__CUDA_NO_HALF2_OPERATORS__",
100 |             ],
101 |         }
102 |         ext_modules = [CUDAExtension("sam2._C", srcs, extra_compile_args=compile_args)]
103 |     except Exception as e:
104 |         if BUILD_ALLOW_ERRORS:
105 |             print(CUDA_ERROR_MSG.format(e))
106 |             ext_modules = []
107 |         else:
108 |             raise e
109 | 
110 |     return ext_modules
111 | 
112 | 
113 | try:
114 |     from torch.utils.cpp_extension import BuildExtension
115 | 
116 |     class BuildExtensionIgnoreErrors(BuildExtension):
117 | 
118 |         def finalize_options(self):
119 |             try:
120 |                 super().finalize_options()
121 |             except Exception as e:
122 |                 print(CUDA_ERROR_MSG.format(e))
123 |                 self.extensions = []
124 | 
125 |         def build_extensions(self):
126 |             try:
127 |                 super().build_extensions()
128 |             except Exception as e:
129 |                 print(CUDA_ERROR_MSG.format(e))
130 |                 self.extensions = []
131 | 
132 |         def get_ext_filename(self, ext_name):
133 |             try:
134 |                 return super().get_ext_filename(ext_name)
135 |             except Exception as e:
136 |                 print(CUDA_ERROR_MSG.format(e))
137 |                 self.extensions = []
138 |                 return "_C.so"
139 | 
140 |     cmdclass = {
141 |         "build_ext": (
142 |             BuildExtensionIgnoreErrors.with_options(no_python_abi_suffix=True)
143 |             if BUILD_ALLOW_ERRORS
144 |             else BuildExtension.with_options(no_python_abi_suffix=True)
145 |         )
146 |     }
147 | except Exception as e:
148 |     cmdclass = {}
149 |     if BUILD_ALLOW_ERRORS:
150 |         print(CUDA_ERROR_MSG.format(e))
151 |     else:
152 |         raise e
153 | 
154 | 
155 | # Setup configuration
156 | setup(
157 |     name=NAME,
158 |     version=VERSION,
159 |     description=DESCRIPTION,
160 |     long_description=LONG_DESCRIPTION,
161 |     long_description_content_type="text/markdown",
162 |     url=URL,
163 |     author=AUTHOR,
164 |     author_email=AUTHOR_EMAIL,
165 |     license=LICENSE,
166 |     packages=find_packages(exclude="notebooks"),
167 |     include_package_data=True,
168 |     install_requires=REQUIRED_PACKAGES,
169 |     extras_require=EXTRA_PACKAGES,
170 |     python_requires=">=3.10.0",
171 |     ext_modules=get_extensions(),
172 |     cmdclass=cmdclass,
173 | )
174 | 


--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
 1 | ## SAM 2 toolkits
 2 | 
 3 | This directory provides toolkits for additional SAM 2 use cases.
 4 | 
 5 | ### Semi-supervised VOS inference
 6 | 
 7 | The `vos_inference.py` script can be used to generate predictions for semi-supervised video object segmentation (VOS) evaluation on datasets such as [DAVIS](https://davischallenge.org/index.html), [MOSE](https://henghuiding.github.io/MOSE/) or the SA-V dataset.
 8 | 
 9 | After installing SAM 2 and its dependencies, it can be used as follows ([DAVIS 2017 dataset](https://davischallenge.org/davis2017/code.html) as an example). This script saves the prediction PNG files to the `--output_mask_dir`.
10 | ```bash
11 | python ./tools/vos_inference.py \
12 |   --sam2_cfg configs/sam2.1/sam2.1_hiera_b+.yaml \
13 |   --sam2_checkpoint ./checkpoints/sam2.1_hiera_base_plus.pt \
14 |   --base_video_dir /path-to-davis-2017/JPEGImages/480p \
15 |   --input_mask_dir /path-to-davis-2017/Annotations/480p \
16 |   --video_list_file /path-to-davis-2017/ImageSets/2017/val.txt \
17 |   --output_mask_dir ./outputs/davis_2017_pred_pngs
18 | ```
19 | (replace `/path-to-davis-2017` with the path to DAVIS 2017 dataset)
20 | 
21 | To evaluate on the SA-V dataset with per-object PNG files for the object masks, we need to **add the `--per_obj_png_file` flag** as follows (using SA-V val as an example). This script will also save per-object PNG files for the output masks under the `--per_obj_png_file` flag.
22 | ```bash
23 | python ./tools/vos_inference.py \
24 |   --sam2_cfg configs/sam2.1/sam2.1_hiera_b+.yaml \
25 |   --sam2_checkpoint ./checkpoints/sam2.1_hiera_base_plus.pt \
26 |   --base_video_dir /path-to-sav-val/JPEGImages_24fps \
27 |   --input_mask_dir /path-to-sav-val/Annotations_6fps \
28 |   --video_list_file /path-to-sav-val/sav_val.txt \
29 |   --per_obj_png_file \
30 |   --output_mask_dir ./outputs/sav_val_pred_pngs
31 | ```
32 | (replace `/path-to-sav-val` with the path to SA-V val)
33 | 
34 | Then, we can use the evaluation tools or servers for each dataset to get the performance of the prediction PNG files above.
35 | 
36 | Note: by default, the `vos_inference.py` script above assumes that all objects to track already appear on frame 0 in each video (as is the case in DAVIS, MOSE or SA-V). **For VOS datasets that don't have all objects to track appearing in the first frame (such as LVOS or YouTube-VOS), please add the `--track_object_appearing_later_in_video` flag when using `vos_inference.py`**.
37 | 


--------------------------------------------------------------------------------