├── .gradient └── settings.yaml ├── AutoYOLO.ipynb ├── AutoYOLO.py ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── README.md ├── assets ├── logo.png ├── masks1.png ├── masks2.jpg ├── model_diagram.png ├── notebook1.png └── notebook2.png ├── automatic_mask_generator.py ├── build_sam.py ├── dataset ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── catalog.cpython-39.pyc │ └── concat_dataset.cpython-39.pyc ├── base_dataset.py ├── catalog.py ├── cd_dataset.py ├── concat_dataset.py ├── grounding_dataset.py ├── layout_dataset.py ├── tsv.py ├── tsv_dataset.py └── utils.py ├── datasets └── .placeholder ├── gligen ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── distributed.cpython-39.pyc │ ├── evaluator.cpython-39.pyc │ ├── task_grounded_generation.cpython-39.pyc │ └── trainer.cpython-39.pyc ├── create_meta.py ├── distributed.py ├── evaluator.py ├── ldm │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── util.cpython-39.pyc │ ├── data │ │ ├── __init__.py │ │ ├── base.py │ │ ├── imagenet.py │ │ ├── imagenet_clsidx_to_label.txt │ │ ├── index_synset.yaml │ │ └── lsun.py │ ├── lr_scheduler.py │ ├── models │ │ ├── __pycache__ │ │ │ └── autoencoder.cpython-39.pyc │ │ ├── autoencoder.py │ │ └── diffusion │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── ddim.cpython-39.pyc │ │ │ ├── ddpm.cpython-39.pyc │ │ │ ├── ldm.cpython-39.pyc │ │ │ └── plms.cpython-39.pyc │ │ │ ├── classifier.py │ │ │ ├── ddim.py │ │ │ ├── ddpm.py │ │ │ ├── ldm.py │ │ │ └── plms.py │ ├── modules │ │ ├── __pycache__ │ │ │ ├── attention.cpython-39.pyc │ │ │ └── x_transformer.cpython-39.pyc │ │ ├── attention.py │ │ ├── diffusionmodules │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ ├── model.cpython-39.pyc │ │ │ │ ├── openaimodel.cpython-39.pyc │ │ │ │ ├── positionnet.cpython-39.pyc │ │ │ │ └── util.cpython-39.pyc │ │ │ ├── model.py │ │ │ ├── openaimodel.py │ │ │ ├── positionnet.py │ │ │ ├── positionnet_with_image.py │ │ │ └── util.py │ │ ├── distributions │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ └── distributions.cpython-39.pyc │ │ │ └── distributions.py │ │ ├── ema.py │ │ ├── encoders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ └── modules.cpython-39.pyc │ │ │ ├── modules.py │ │ │ └── modules_backup.py │ │ ├── image_degradation │ │ │ ├── __init__.py │ │ │ ├── bsrgan.py │ │ │ ├── bsrgan_light.py │ │ │ └── utils_image.py │ │ ├── losses │ │ │ ├── __init__.py │ │ │ ├── contperceptual.py │ │ │ └── vqperceptual.py │ │ └── x_transformer.py │ └── util.py ├── projection_matrix.pth ├── task_grounded_generation.py └── trainer.py ├── groundingdino ├── _C.cpython-39-x86_64-linux-gnu.so ├── __init__.py ├── __pycache__ │ └── __init__.cpython-39.pyc ├── config │ ├── GroundingDINO_SwinB.cfg.py │ └── GroundingDINO_SwinT_OGC.py ├── datasets │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── transforms.cpython-39.pyc │ └── transforms.py ├── models │ ├── GroundingDINO │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── bertwarper.cpython-39.pyc │ │ │ ├── fuse_modules.cpython-39.pyc │ │ │ ├── groundingdino.cpython-39.pyc │ │ │ ├── ms_deform_attn.cpython-39.pyc │ │ │ ├── transformer.cpython-39.pyc │ │ │ ├── transformer_vanilla.cpython-39.pyc │ │ │ └── utils.cpython-39.pyc │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ 
├── backbone.cpython-39.pyc │ │ │ │ ├── position_encoding.cpython-39.pyc │ │ │ │ └── swin_transformer.cpython-39.pyc │ │ │ ├── backbone.py │ │ │ ├── position_encoding.py │ │ │ └── swin_transformer.py │ │ ├── bertwarper.py │ │ ├── csrc │ │ │ ├── MsDeformAttn │ │ │ │ ├── ms_deform_attn.h │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ ├── cuda_version.cu │ │ │ └── vision.cpp │ │ ├── fuse_modules.py │ │ ├── groundingdino.py │ │ ├── ms_deform_attn.py │ │ ├── transformer.py │ │ ├── transformer_vanilla.py │ │ └── utils.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── registry.cpython-39.pyc │ └── registry.py ├── util │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── box_ops.cpython-39.pyc │ │ ├── slconfig.cpython-39.pyc │ │ └── utils.cpython-39.pyc │ ├── box_ops.py │ ├── get_tokenlizer.py │ ├── inference.py │ ├── logger.py │ ├── misc.py │ ├── slconfig.py │ ├── slio.py │ ├── time_counter.py │ ├── utils.py │ ├── visualizer.py │ └── vl_utils.py └── version.py ├── linter.sh ├── modeling ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── common.cpython-39.pyc │ ├── image_encoder.cpython-39.pyc │ ├── mask_decoder.cpython-39.pyc │ ├── prompt_encoder.cpython-39.pyc │ ├── sam.cpython-39.pyc │ └── transformer.cpython-39.pyc ├── common.py ├── image_encoder.py ├── mask_decoder.py ├── prompt_encoder.py ├── sam.py └── transformer.py ├── notebooks ├── automatic_mask_generator_example.ipynb ├── images │ ├── dog.jpg │ ├── groceries.jpg │ └── truck.jpg ├── onnx_model_example.ipynb └── predictor_example.ipynb ├── outputs └── .placeholder ├── predictor.py ├── requirements.txt ├── runs └── detect │ └── .placeholder ├── scripts ├── amg.py └── export_onnx_model.py ├── segment_anything ├── .flake8 ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── automatic_mask_generator.cpython-39.pyc │ ├── build_sam.cpython-39.pyc │ └── predictor.cpython-39.pyc ├── assets │ ├── masks1.png │ ├── masks2.jpg │ ├── model_diagram.png │ ├── notebook1.png │ └── notebook2.png ├── automatic_mask_generator.py ├── build_sam.py ├── linter.sh ├── modeling │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── common.cpython-39.pyc │ │ ├── image_encoder.cpython-39.pyc │ │ ├── mask_decoder.cpython-39.pyc │ │ ├── prompt_encoder.cpython-39.pyc │ │ ├── sam.cpython-39.pyc │ │ └── transformer.cpython-39.pyc │ ├── common.py │ ├── image_encoder.py │ ├── mask_decoder.py │ ├── prompt_encoder.py │ ├── sam.py │ └── transformer.py ├── notebooks │ ├── automatic_mask_generator_example.ipynb │ ├── images │ │ ├── dog.jpg │ │ ├── groceries.jpg │ │ └── truck.jpg │ ├── onnx_model_example.ipynb │ └── predictor_example.ipynb ├── predictor.py ├── scripts │ ├── amg.py │ └── export_onnx_model.py ├── segment_anything.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt ├── segment_anything │ ├── __init__.py │ ├── automatic_mask_generator.py │ ├── build_sam.py │ ├── modeling │ │ ├── __init__.py │ │ ├── common.py │ │ ├── image_encoder.py │ │ ├── mask_decoder.py │ │ ├── prompt_encoder.py │ │ ├── sam.py │ │ └── transformer.py │ ├── predictor.py │ └── utils │ │ ├── __init__.py │ │ ├── amg.py │ │ ├── onnx.py │ │ └── transforms.py ├── setup.cfg ├── setup.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── 
__init__.cpython-39.pyc │ ├── amg.cpython-39.pyc │ └── transforms.cpython-39.pyc │ ├── amg.py │ ├── onnx.py │ └── transforms.py ├── setup.cfg ├── setup.py ├── setup.sh └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-39.pyc ├── amg.cpython-39.pyc └── transforms.cpython-39.pyc ├── amg.py ├── onnx.py └── transforms.py /.gradient/settings.yaml: -------------------------------------------------------------------------------- 1 | integrations: 2 | dolly-v2-12b: 3 | type: dataset 4 | ref: paperspace/dsi5inn7aonbmv3:latest 5 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 
58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to segment-anything 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints, using the `linter.sh` script in the project's root directory. Linting requires `black==23.*`, `isort==5.12.0`, `flake8`, and `mypy`. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to segment-anything, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoYOLO - Ultralytics YOLOv8 Web UI v2 2 | 3 | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/gradient-ai/autoyolo?machine=Free-GPU) 4 | 5 | This Gradio application enables the end-to-end creation of a YOLOv8 object detection model, using Segment Anything, GroundingDINO, BLIP-2, and Dolly v2 to automatically label objects in images. Users can then train Ultralytics YOLOv8 models to generate predictions on submitted videos and images.
Optionally, users may also manually label images as desired. 6 | 7 | ## Capabilities 8 | 9 | - **AutoLabel**: The key contribution of this application is the AutoLabeler. Using the autolabel tab, users can automatically generate fully labeled images in the Ultralytics YOLOv8 format simply by submitting the images and the desired target object labels 10 | - **Manually Label Images**: this tab lets you upload images, either in bulk or one at a time, to be labeled. The bounding boxes are automatically detected, and the labels are assigned through a textbox. Entries are separated by semi-colons 11 | - **Image Gallery**: this tab lets you view your labeled images, separated by their assigned training split 12 | - **Train**: train any of the YOLOv8 models on the labeled images. Outputs the validation metrics and the best trained model from the run, `best.pt` 13 | - **Inference**: predict object labels on images and videos. Works for direct upload and URL submission of images and YouTube videos 14 | 15 | ## Next steps 16 | 17 | - Integrating with Roboflow to enable training on the application with existing projects and Universe datasets 18 | - Streaming video object detection for real-time viewing and interaction with the object detection model 19 | - Adding additional text models (GPT4All, OpenAssistant, Otter, etc.) to enable multimodal integration. This could potentially remove BLIP-2 from the pipeline and speed up processing 20 | 21 | ## Thanks and credits to: 22 | 23 | - This application was inspired by the work done by IDEA Research on their [Grounded Segment Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything) project. Thanks to them for releasing their awesome work, and for inspiring this project. 24 | - This application wouldn't have been feasible without the groundwork completed by the researchers for the [GLIGEN](https://github.com/gligen/GLIGEN) project. Their bounding box detector code was instrumental to making this work.
25 | - [Ultralytics](https://github.com/ultralytics/ultralytics) for their incredible work on YOLOv8 26 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/logo.png -------------------------------------------------------------------------------- /assets/masks1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/masks1.png -------------------------------------------------------------------------------- /assets/masks2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/masks2.jpg -------------------------------------------------------------------------------- /assets/model_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/model_diagram.png -------------------------------------------------------------------------------- /assets/notebook1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/notebook1.png -------------------------------------------------------------------------------- /assets/notebook2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/notebook2.png -------------------------------------------------------------------------------- /build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
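# --- Added note (not part of the original file): this module constructs the three SAM
# variants (ViT-H, ViT-L, ViT-B) and exposes them through `sam_model_registry` below.
# A minimal, hypothetical usage sketch (the checkpoint path is an assumption):
#     sam = sam_model_registry["vit_b"](checkpoint="weights/sam_vit_b.pth")
#     sam.to("cuda")
# Passing checkpoint=None builds the architecture without loading pretrained weights.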
6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam = build_sam_vit_h 25 | 26 | 27 | def build_sam_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_model_registry = { 48 | "default": build_sam, 49 | "vit_h": build_sam, 50 | "vit_l": build_sam_vit_l, 51 | "vit_b": build_sam_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoder( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | ), 99 | pixel_mean=[123.675, 116.28, 103.53], 100 | pixel_std=[58.395, 57.12, 57.375], 101 | ) 102 | sam.eval() 103 | if checkpoint is not None: 104 | with open(checkpoint, "rb") as f: 105 | state_dict = torch.load(f) 106 | sam.load_state_dict(state_dict) 107 | return sam 108 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/dataset/__init__.py -------------------------------------------------------------------------------- /dataset/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/dataset/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/catalog.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/dataset/__pycache__/catalog.cpython-39.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/concat_dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/dataset/__pycache__/concat_dataset.cpython-39.pyc -------------------------------------------------------------------------------- /dataset/catalog.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class DatasetCatalog: 4 | def __init__(self, ROOT, which_embedder): 5 | assert which_embedder in ['clip', 'bert'] 6 | 7 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 8 | 9 | 10 | self.VGGrounding = { 11 | "target": "dataset.tsv_dataset.TSVDataset", 12 | "train_params": dict( 13 | tsv_path=os.path.join(ROOT,'GROUNDING/gqa/tsv/train-00.tsv'), 14 | ) 15 | } 16 | 17 | 18 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 19 | 20 | 21 | self.FlickrGrounding = { 22 | "target": "dataset.tsv_dataset.TSVDataset", 23 | "train_params":dict( 24 | tsv_path=os.path.join(ROOT,'GROUNDING/flickr30k/tsv/train-00.tsv'), 25 | ) 26 | } 27 | 28 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 29 | 30 | self.SBUGrounding = { 31 | "target": "dataset.tsv_dataset.TSVDataset", 32 | "train_params":dict( 33 | tsv_path=os.path.join(ROOT,'GROUNDING/SBU/tsv/train-00.tsv'), 34 | ) 35 | } 36 | 37 | 38 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 39 | 40 | 41 | self.CC3MGrounding = { 42 | "target": "dataset.tsv_dataset.TSVDataset", 43 | "train_params":dict( 44 | tsv_path=os.path.join(ROOT,'GROUNDING/CC3M/tsv/train-00.tsv'), 45 | ) 46 | } 47 | 48 | 49 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 50 | 51 | 52 | self.CC12MGrounding = { 53 | "target": "dataset.tsv_dataset.TSVDataset", 54 | "train_params":dict( 55 | tsv_path=os.path.join(ROOT,'GROUNDING/CC12M/tsv/train-00.tsv'), 56 | ) 57 | } 58 | 59 | 60 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 61 | 62 | # temp = 'category_embedding_clip.pth' if which_embedder == 'clip' else 'category_embedding_bert.pth' 63 | # obj365_category_embedding_path = os.path.join(ROOT, 'OBJECTS365', temp) 64 | 65 | self.Obj365Detection = { 66 | "target": "dataset.tsv_dataset.TSVDataset", 67 | "train_params":dict( 68 | tsv_path=os.path.join(ROOT,'OBJECTS365/tsv/train-00.tsv'), 69 | ), 70 | } 71 | 72 | 73 | -------------------------------------------------------------------------------- /dataset/concat_dataset.py: -------------------------------------------------------------------------------- 1 | from .catalog import DatasetCatalog 2 | from ldm.util import instantiate_from_config 3 | import torch 4 | 5 | 6 | 7 | 8 | class ConCatDataset(): 9 | def __init__(self, dataset_name_list, ROOT, which_embedder, train=True, repeats=None): 10 | self.datasets = [] 11 | cul_previous_dataset_length = 0 12 | offset_map = [] 13 | which_dataset = [] 14 | 15 | if repeats is None: 16 | repeats = [1] * len(dataset_name_list) 17 | else: 18 | assert len(repeats) == len(dataset_name_list) 19 | 20 | 21 | Catalog = DatasetCatalog(ROOT, 
which_embedder) 22 | for dataset_idx, (dataset_name, yaml_params) in enumerate(dataset_name_list.items()): 23 | repeat = repeats[dataset_idx] 24 | 25 | dataset_dict = getattr(Catalog, dataset_name) 26 | 27 | target = dataset_dict['target'] 28 | params = dataset_dict['train_params'] if train else dataset_dict['val_params'] 29 | if yaml_params is not None: 30 | params.update(yaml_params) 31 | dataset = instantiate_from_config( dict(target=target, params=params) ) 32 | 33 | self.datasets.append(dataset) 34 | for _ in range(repeat): 35 | offset_map.append( torch.ones(len(dataset))*cul_previous_dataset_length ) 36 | which_dataset.append( torch.ones(len(dataset))*dataset_idx ) 37 | cul_previous_dataset_length += len(dataset) 38 | offset_map = torch.cat(offset_map, dim=0).long() 39 | self.total_length = cul_previous_dataset_length 40 | 41 | self.mapping = torch.arange(self.total_length) - offset_map 42 | self.which_dataset = torch.cat(which_dataset, dim=0).long() 43 | 44 | 45 | def total_images(self): 46 | count = 0 47 | for dataset in self.datasets: 48 | print(dataset.total_images()) 49 | count += dataset.total_images() 50 | return count 51 | 52 | 53 | 54 | def __getitem__(self, idx): 55 | dataset = self.datasets[ self.which_dataset[idx] ] 56 | return dataset[ self.mapping[idx] ] 57 | 58 | 59 | def __len__(self): 60 | return self.total_length 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /dataset/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2018 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
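# --- Added note (not part of the original file): this module collects the image-tensor
# helpers used by the dataset pipeline: ImageNet mean/std normalization and its inverse
# (imagenet_preprocess / imagenet_deprocess), imagenet_deprocess_batch which returns
# uint8 images in [0, 255], a PIL-backed Resize transform, and split_graph_batch, which
# splits batched scene-graph triples and per-object data back into per-image lists.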
16 | 17 | import PIL 18 | import torch 19 | import torchvision.transforms as T 20 | 21 | 22 | IMAGENET_MEAN = [0.485, 0.456, 0.406] 23 | IMAGENET_STD = [0.229, 0.224, 0.225] 24 | 25 | INV_IMAGENET_MEAN = [-m for m in IMAGENET_MEAN] 26 | INV_IMAGENET_STD = [1.0 / s for s in IMAGENET_STD] 27 | 28 | 29 | def imagenet_preprocess(): 30 | return T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) 31 | 32 | 33 | def rescale(x): 34 | lo, hi = x.min(), x.max() 35 | return x.sub(lo).div(hi - lo) 36 | 37 | 38 | def imagenet_deprocess(rescale_image=True): 39 | transforms = [ 40 | T.Normalize(mean=[0, 0, 0], std=INV_IMAGENET_STD), 41 | T.Normalize(mean=INV_IMAGENET_MEAN, std=[1.0, 1.0, 1.0]), 42 | ] 43 | if rescale_image: 44 | transforms.append(rescale) 45 | return T.Compose(transforms) 46 | 47 | 48 | def imagenet_deprocess_batch(imgs, rescale=True): 49 | """ 50 | Input: 51 | - imgs: FloatTensor of shape (N, C, H, W) giving preprocessed images 52 | 53 | Output: 54 | - imgs_de: ByteTensor of shape (N, C, H, W) giving deprocessed images 55 | in the range [0, 255] 56 | """ 57 | if isinstance(imgs, torch.autograd.Variable): 58 | imgs = imgs.data 59 | imgs = imgs.cpu().clone() 60 | deprocess_fn = imagenet_deprocess(rescale_image=rescale) 61 | imgs_de = [] 62 | for i in range(imgs.size(0)): 63 | img_de = deprocess_fn(imgs[i])[None] 64 | img_de = img_de.mul(255).clamp(0, 255).byte() 65 | imgs_de.append(img_de) 66 | imgs_de = torch.cat(imgs_de, dim=0) 67 | return imgs_de 68 | 69 | 70 | class Resize(object): 71 | def __init__(self, size, interp=PIL.Image.BILINEAR): 72 | if isinstance(size, tuple): 73 | H, W = size 74 | self.size = (W, H) 75 | else: 76 | self.size = (size, size) 77 | self.interp = interp 78 | 79 | def __call__(self, img): 80 | return img.resize(self.size, self.interp) 81 | 82 | 83 | def unpack_var(v): 84 | if isinstance(v, torch.autograd.Variable): 85 | return v.data 86 | return v 87 | 88 | 89 | def split_graph_batch(triples, obj_data, obj_to_img, triple_to_img): 90 | triples = unpack_var(triples) 91 | obj_data = [unpack_var(o) for o in obj_data] 92 | obj_to_img = unpack_var(obj_to_img) 93 | triple_to_img = unpack_var(triple_to_img) 94 | 95 | triples_out = [] 96 | obj_data_out = [[] for _ in obj_data] 97 | obj_offset = 0 98 | N = obj_to_img.max() + 1 99 | for i in range(N): 100 | o_idxs = (obj_to_img == i).nonzero().view(-1) 101 | t_idxs = (triple_to_img == i).nonzero().view(-1) 102 | 103 | cur_triples = triples[t_idxs].clone() 104 | cur_triples[:, 0] -= obj_offset 105 | cur_triples[:, 2] -= obj_offset 106 | triples_out.append(cur_triples) 107 | 108 | for j, o_data in enumerate(obj_data): 109 | cur_o_data = None 110 | if o_data is not None: 111 | cur_o_data = o_data[o_idxs] 112 | obj_data_out[j].append(cur_o_data) 113 | 114 | obj_offset += o_idxs.size(0) 115 | 116 | return triples_out, obj_data_out 117 | -------------------------------------------------------------------------------- /datasets/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/datasets/.placeholder -------------------------------------------------------------------------------- /gligen/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import os, sys 3 | sys.path.append(os.path.dirname(__file__)) 4 | sys.path.append(os.path.join(os.path.dirname(__file__), "ldm")) 5 | 6 | import gligen.evaluator as evaluator 7 | import gligen.trainer as trainer 
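# --- Added note (not part of the original file): the two sys.path.append calls above put
# this package directory and its "ldm" subfolder on the import path, which is why other
# files in the repo can use top-level imports such as
# `from ldm.util import instantiate_from_config` without installing ldm as a package.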
8 | 9 | 10 | # import gligen.ldm as ldm -------------------------------------------------------------------------------- /gligen/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/__pycache__/distributed.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/distributed.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/__pycache__/evaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/evaluator.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/__pycache__/task_grounded_generation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/task_grounded_generation.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/__pycache__/trainer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/trainer.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/create_meta.py: -------------------------------------------------------------------------------- 1 | CKPTS = [ 2 | 3 | dict( 4 | path="/home/chunyl/azure_mount/yuhengdb/fine_tune_ldm/version5_branch6_output/GoldG+SBU+CC3M+CC12M+O365/second_stage_drop_both/tag01/checkpoint_00450001.pth", 5 | feature_type=['before','after_reproject'], 6 | save_folder_name="v5b6_drop_both", 7 | ), 8 | 9 | 10 | # dict( 11 | # path="/home/v-yuhengli/blobfuse/output/fine_tune_ldm/version5_branch6_output/GoldG+SBU+CC3M+CC12M+O365/second_stage_drop_none/tag00/checkpoint_00165001.pth", 12 | # feature_type=['before','after_reproject'], 13 | # save_folder_name="v5b6_drop_none", 14 | # ), 15 | 16 | 17 | 18 | 19 | 20 | ] 21 | 22 | 23 | 24 | # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | # if meta["has_image_mask"] == 0: 34 | # image_embeddings = text_embeddings 35 | # if meta["has_text_mask"] == 0: 36 | # text_embeddings = image_embeddings 37 | 38 | # out = { 39 | # "boxes" : boxes.unsqueeze(0).repeat(batch,1,1), 40 | # "masks" : masks.unsqueeze(0).repeat(batch,1), 41 | # "text_masks" : masks.unsqueeze(0).repeat(batch,1), 42 | # "image_masks" : masks.unsqueeze(0).repeat(batch,1), 43 | # "text_embeddings" : text_embeddings.unsqueeze(0).repeat(batch,1,1), 44 | # "image_embeddings" : image_embeddings.unsqueeze(0).repeat(batch,1,1) 45 | # } 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | META = [ 54 | 55 | 56 | dict( 57 | prompt = "a teddy bear sitting next to a red bird", 58 | phrases = ['a teddy bear', 'a red bird'], 59 | images = ['images/teddy.jpg', 'images/red_bird.jpg'], 60 | locations = [ 
[0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 61 | alpha_type = [1.0, 0, 0.0], 62 | has_text_mask = 1, 63 | has_image_mask = 0, 64 | save_folder_name="teddy_bird_1_1" 65 | ), 66 | 67 | 68 | # dict( 69 | # prompt = "a teddy bear sitting next to a bird", 70 | # phrases = ['a teddy bear', 'a bird'], 71 | # images = ['images/teddy.jpg', 'images/red_bird.jpg'], 72 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 73 | # alpha_type = [1.0, 0, 0.0], 74 | # has_text_mask = 1, 75 | # has_image_mask = 1, 76 | # save_folder_name="teddy_bird_1_1" 77 | # ), 78 | 79 | 80 | # dict( 81 | # prompt = "a teddy bear sitting next to a bird", 82 | # phrases = ['a teddy bear', 'a bird'], 83 | # images = ['images/teddy.jpg', 'images/red_bird.jpg'], 84 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 85 | # alpha_type = [0.5, 0, 0.5], 86 | # has_text_mask = 1, 87 | # has_image_mask = 0, 88 | # save_folder_name="teddy_bird_1_0" 89 | # ), 90 | 91 | # dict( 92 | # prompt = "", 93 | # phrases = ['a teddy bear', 'an umbrella'], 94 | # images = ['images/teddy.jpg', 'images/umbrella.png'], 95 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 96 | # alpha_type = [1.0, 0, 0.0], 97 | # has_text_mask = 1, 98 | # has_image_mask = 1, 99 | # save_folder_name="empty_teddy_umbrella_1_1" 100 | # ), 101 | 102 | # dict( 103 | # prompt = "hello kitty and bird hybrid", 104 | # phrases = ['a hello kitty', 'a hello kitty'], 105 | # images = ['images/red_bird.jpg', 'images/red_bird.jpg'], 106 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 107 | # has_text_mask = 1, 108 | # has_image_mask = 1, 109 | # save_folder_name="hello+bird_1_1" 110 | # ), 111 | 112 | # dict( 113 | # prompt = "hello kitty and teddy bear hybrid", 114 | # phrases = ['a hello kitty', 'a hello kitty'], 115 | # images = ['images/teddy.jpg', 'images/teddy.jpg'], 116 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 117 | # has_text_mask = 1, 118 | # has_image_mask = 1, 119 | # save_folder_name="hello+teddy_1_1" 120 | # ), 121 | 122 | # dict( 123 | # prompt = "bird and hello kitty hybrid", 124 | # phrases = ['a bird', 'a bird'], 125 | # images = ['images/hello.jpg', 'images/hello.jpg'], 126 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 127 | # alpha_type = [1.0, 0, 0.0], 128 | # has_text_mask = 1, 129 | # has_image_mask = 0.5, 130 | # save_folder_name="bird+hello_1_1" 131 | # ), 132 | 133 | 134 | 135 | # dict( 136 | # prompt = "a deer standing in front of a brick house in the woods, anime, oil painting, high resolution, cottagecore, ghibli inspired, 4k", 137 | # phrases = ['a deer'], 138 | # images = ['images/sky.jpg'], 139 | # locations = [ [0.0,0.5,0.5,0.9] ], 140 | # alpha_type = [1, 0, 0], 141 | # has_text_mask = 1, 142 | # has_image_mask = 1, 143 | # save_folder_name="deer_sky" 144 | # ), 145 | 146 | 147 | # dict( 148 | # prompt = "A woman sitting in a restaurant with a slice of pizza in front of her", 149 | # phrases = ['dining table', 'pizza', 'person', 'wall', 'car', 'paper', 'chair', 'window', 'bottle', 'cup'], 150 | # images = ['images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg'], 151 | # locations = [ [0.0030, 0.3589, 1.0000, 1.0000], 152 | # [0.0779, 0.6744, 0.9768, 1.0000], 153 | # [0.2236, 0.0000, 0.7809, 0.4352], 154 | # [0.0000, 0.0000, 0.4313, 0.4505], 155 | # [0.6275, 0.1050, 0.9444, 0.2497], 156 | # [0.0000, 0.3859, 0.1250, 0.6922], 
157 | # [0.7137, 0.2389, 0.8540, 0.4549], 158 | # [0.0000, 0.0000, 0.4667, 0.0630], 159 | # [0.3822, 0.4235, 0.4932, 0.6575], 160 | # [0.6616, 0.3617, 0.7880, 0.5165] ], 161 | # alpha_type = [0.0, 0, 1.0], 162 | # has_text_mask = 1, 163 | # has_image_mask = 0, 164 | # save_folder_name="pizza_1_0" 165 | # ), 166 | 167 | 168 | 169 | 170 | ] -------------------------------------------------------------------------------- /gligen/distributed.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pickle 3 | 4 | import torch 5 | from torch import distributed as dist 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | def get_rank(): 10 | if not dist.is_available(): 11 | return 0 12 | 13 | if not dist.is_initialized(): 14 | return 0 15 | 16 | return dist.get_rank() 17 | 18 | 19 | def synchronize(): 20 | if not dist.is_available(): 21 | return 22 | if not dist.is_initialized(): 23 | return 24 | 25 | world_size = dist.get_world_size() 26 | if world_size == 1: 27 | return 28 | 29 | dist.barrier() 30 | 31 | 32 | def get_world_size(): 33 | if not dist.is_available(): 34 | return 1 35 | if not dist.is_initialized(): 36 | return 1 37 | return dist.get_world_size() 38 | 39 | 40 | def reduce_sum(tensor): 41 | if not dist.is_available(): 42 | return tensor 43 | 44 | if not dist.is_initialized(): 45 | return tensor 46 | 47 | tensor = tensor.clone() 48 | dist.all_reduce(tensor, op=dist.ReduceOp.SUM) 49 | 50 | return tensor 51 | 52 | 53 | def gather_grad(params): 54 | world_size = get_world_size() 55 | 56 | if world_size == 1: 57 | return 58 | 59 | for param in params: 60 | if param.grad is not None: 61 | dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) 62 | param.grad.data.div_(world_size) 63 | 64 | 65 | def all_gather(data): 66 | world_size = get_world_size() 67 | 68 | if world_size == 1: 69 | return [data] 70 | 71 | buffer = pickle.dumps(data) 72 | storage = torch.ByteStorage.from_buffer(buffer) 73 | tensor = torch.ByteTensor(storage).to('cuda') 74 | 75 | local_size = torch.IntTensor([tensor.numel()]).to('cuda') 76 | size_list = [torch.IntTensor([0]).to('cuda') for _ in range(world_size)] 77 | dist.all_gather(size_list, local_size) 78 | size_list = [int(size.item()) for size in size_list] 79 | max_size = max(size_list) 80 | 81 | tensor_list = [] 82 | for _ in size_list: 83 | tensor_list.append(torch.ByteTensor(size=(max_size,)).to('cuda')) 84 | 85 | if local_size != max_size: 86 | padding = torch.ByteTensor(size=(max_size - local_size,)).to('cuda') 87 | tensor = torch.cat((tensor, padding), 0) 88 | 89 | dist.all_gather(tensor_list, tensor) 90 | 91 | data_list = [] 92 | 93 | for size, tensor in zip(size_list, tensor_list): 94 | buffer = tensor.cpu().numpy().tobytes()[:size] 95 | data_list.append(pickle.loads(buffer)) 96 | 97 | return data_list 98 | 99 | 100 | def reduce_loss_dict(loss_dict): 101 | world_size = get_world_size() 102 | 103 | if world_size < 2: 104 | return loss_dict 105 | 106 | with torch.no_grad(): 107 | keys = [] 108 | losses = [] 109 | 110 | for k in sorted(loss_dict.keys()): 111 | keys.append(k) 112 | losses.append(loss_dict[k]) 113 | 114 | losses = torch.stack(losses, 0) 115 | dist.reduce(losses, dst=0) 116 | 117 | if dist.get_rank() == 0: 118 | losses /= world_size 119 | 120 | reduced_losses = {k: v for k, v in zip(keys, losses)} 121 | 122 | return reduced_losses 123 | -------------------------------------------------------------------------------- /gligen/ldm/__init__.py: 
-------------------------------------------------------------------------------- 1 | import gligen.evaluator as evaluator 2 | import gligen.trainer as trainer 3 | import gligen.ldm as ldm -------------------------------------------------------------------------------- /gligen/ldm/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/__pycache__/util.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/__pycache__/util.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/data/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/data/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset 3 | 4 | 5 | class Txt2ImgIterableBaseDataset(IterableDataset): 6 | ''' 7 | Define an interface to make the IterableDatasets for text2img data chainable 8 | ''' 9 | def __init__(self, num_records=0, valid_ids=None, size=256): 10 | super().__init__() 11 | self.num_records = num_records 12 | self.valid_ids = valid_ids 13 | self.sample_ids = valid_ids 14 | self.size = size 15 | 16 | print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.') 17 | 18 | def __len__(self): 19 | return self.num_records 20 | 21 | @abstractmethod 22 | def __iter__(self): 23 | pass -------------------------------------------------------------------------------- /gligen/ldm/data/lsun.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import PIL 4 | from PIL import Image 5 | from torch.utils.data import Dataset 6 | from torchvision import transforms 7 | 8 | 9 | class LSUNBase(Dataset): 10 | def __init__(self, 11 | txt_file, 12 | data_root, 13 | size=None, 14 | interpolation="bicubic", 15 | flip_p=0.5 16 | ): 17 | self.data_paths = txt_file 18 | self.data_root = data_root 19 | with open(self.data_paths, "r") as f: 20 | self.image_paths = f.read().splitlines() 21 | self._length = len(self.image_paths) 22 | self.labels = { 23 | "relative_file_path_": [l for l in self.image_paths], 24 | "file_path_": [os.path.join(self.data_root, l) 25 | for l in self.image_paths], 26 | } 27 | 28 | self.size = size 29 | self.interpolation = {"linear": PIL.Image.LINEAR, 30 | "bilinear": PIL.Image.BILINEAR, 31 | "bicubic": PIL.Image.BICUBIC, 32 | "lanczos": PIL.Image.LANCZOS, 33 | }[interpolation] 34 | self.flip = transforms.RandomHorizontalFlip(p=flip_p) 35 | 36 | def __len__(self): 37 | return self._length 38 | 39 | def __getitem__(self, i): 40 | example = dict((k, self.labels[k][i]) for k in self.labels) 41 | image = Image.open(example["file_path_"]) 42 | if not image.mode == "RGB": 43 | image = image.convert("RGB") 44 | 45 | # default to score-sde 
preprocessing 46 | img = np.array(image).astype(np.uint8) 47 | crop = min(img.shape[0], img.shape[1]) 48 | h, w, = img.shape[0], img.shape[1] 49 | img = img[(h - crop) // 2:(h + crop) // 2, 50 | (w - crop) // 2:(w + crop) // 2] 51 | 52 | image = Image.fromarray(img) 53 | if self.size is not None: 54 | image = image.resize((self.size, self.size), resample=self.interpolation) 55 | 56 | image = self.flip(image) 57 | image = np.array(image).astype(np.uint8) 58 | example["image"] = (image / 127.5 - 1.0).astype(np.float32) 59 | return example 60 | 61 | 62 | class LSUNChurchesTrain(LSUNBase): 63 | def __init__(self, **kwargs): 64 | super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs) 65 | 66 | 67 | class LSUNChurchesValidation(LSUNBase): 68 | def __init__(self, flip_p=0., **kwargs): 69 | super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches", 70 | flip_p=flip_p, **kwargs) 71 | 72 | 73 | class LSUNBedroomsTrain(LSUNBase): 74 | def __init__(self, **kwargs): 75 | super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs) 76 | 77 | 78 | class LSUNBedroomsValidation(LSUNBase): 79 | def __init__(self, flip_p=0.0, **kwargs): 80 | super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms", 81 | flip_p=flip_p, **kwargs) 82 | 83 | 84 | class LSUNCatsTrain(LSUNBase): 85 | def __init__(self, **kwargs): 86 | super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs) 87 | 88 | 89 | class LSUNCatsValidation(LSUNBase): 90 | def __init__(self, flip_p=0., **kwargs): 91 | super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats", 92 | flip_p=flip_p, **kwargs) 93 | -------------------------------------------------------------------------------- /gligen/ldm/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n, **kwargs): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n, **kwargs): 33 | return self.schedule(n,**kwargs) 34 | 35 | 36 | class LambdaWarmUpCosineScheduler2: 37 | """ 38 | supports repeated iterations, configurable via lists 39 | note: use with a base_lr of 1.0. 
40 | """ 41 | def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0): 42 | assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths) 43 | self.lr_warm_up_steps = warm_up_steps 44 | self.f_start = f_start 45 | self.f_min = f_min 46 | self.f_max = f_max 47 | self.cycle_lengths = cycle_lengths 48 | self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths)) 49 | self.last_f = 0. 50 | self.verbosity_interval = verbosity_interval 51 | 52 | def find_in_interval(self, n): 53 | interval = 0 54 | for cl in self.cum_cycles[1:]: 55 | if n <= cl: 56 | return interval 57 | interval += 1 58 | 59 | def schedule(self, n, **kwargs): 60 | cycle = self.find_in_interval(n) 61 | n = n - self.cum_cycles[cycle] 62 | if self.verbosity_interval > 0: 63 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 64 | f"current cycle {cycle}") 65 | if n < self.lr_warm_up_steps[cycle]: 66 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 67 | self.last_f = f 68 | return f 69 | else: 70 | t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]) 71 | t = min(t, 1.0) 72 | f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * ( 73 | 1 + np.cos(t * np.pi)) 74 | self.last_f = f 75 | return f 76 | 77 | def __call__(self, n, **kwargs): 78 | return self.schedule(n, **kwargs) 79 | 80 | 81 | class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2): 82 | 83 | def schedule(self, n, **kwargs): 84 | cycle = self.find_in_interval(n) 85 | n = n - self.cum_cycles[cycle] 86 | if self.verbosity_interval > 0: 87 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 88 | f"current cycle {cycle}") 89 | 90 | if n < self.lr_warm_up_steps[cycle]: 91 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 92 | self.last_f = f 93 | return f 94 | else: 95 | f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle]) 96 | self.last_f = f 97 | return f 98 | 99 | -------------------------------------------------------------------------------- /gligen/ldm/models/__pycache__/autoencoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/__pycache__/autoencoder.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/autoencoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | #import pytorch_lightning as pl 4 | import torch.nn.functional as F 5 | from contextlib import contextmanager 6 | 7 | # from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer 8 | 9 | from ldm.modules.diffusionmodules.model import Encoder, Decoder 10 | from ldm.modules.distributions.distributions import DiagonalGaussianDistribution 11 | 12 | from ldm.util import instantiate_from_config 13 | 14 | 15 | 16 | 17 | class AutoencoderKL(nn.Module): 18 | def __init__(self, 19 | ddconfig, 20 | embed_dim, 21 | scale_factor=1 22 | ): 23 | super().__init__() 24 | self.encoder = Encoder(**ddconfig) 25 | self.decoder = Decoder(**ddconfig) 26 | assert ddconfig["double_z"] 27 | 
self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1) 28 | self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1) 29 | self.embed_dim = embed_dim 30 | self.scale_factor = scale_factor 31 | 32 | 33 | 34 | def encode(self, x): 35 | h = self.encoder(x) 36 | moments = self.quant_conv(h) 37 | posterior = DiagonalGaussianDistribution(moments) 38 | return posterior.sample() * self.scale_factor 39 | 40 | def decode(self, z): 41 | z = 1. / self.scale_factor * z 42 | z = self.post_quant_conv(z) 43 | dec = self.decoder(z) 44 | return dec 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/ddim.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/ddim.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/ddpm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/ddpm.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/ldm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/ldm.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/plms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/plms.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/ddim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from tqdm import tqdm 4 | from functools import partial 5 | 6 | from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like 7 | 8 | 9 | class DDIMSampler(object): 10 | def __init__(self, diffusion, model, schedule="linear", alpha_generator_func=None, set_alpha_scale=None): 11 | super().__init__() 12 | self.diffusion = diffusion 13 | self.model = model 14 | self.device = 
diffusion.betas.device 15 | self.ddpm_num_timesteps = diffusion.num_timesteps 16 | self.schedule = schedule 17 | self.alpha_generator_func = alpha_generator_func 18 | self.set_alpha_scale = set_alpha_scale 19 | 20 | 21 | def register_buffer(self, name, attr): 22 | if type(attr) == torch.Tensor: 23 | attr = attr.to(self.device) 24 | setattr(self, name, attr) 25 | 26 | 27 | def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.): 28 | self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps, 29 | num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=False) 30 | alphas_cumprod = self.diffusion.alphas_cumprod 31 | assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' 32 | to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.device) 33 | 34 | self.register_buffer('betas', to_torch(self.diffusion.betas)) 35 | self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) 36 | self.register_buffer('alphas_cumprod_prev', to_torch(self.diffusion.alphas_cumprod_prev)) 37 | 38 | # calculations for diffusion q(x_t | x_{t-1}) and others 39 | self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))) 40 | self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) 41 | self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu()))) 42 | self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) 43 | self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) 44 | 45 | # ddim sampling parameters 46 | ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(), 47 | ddim_timesteps=self.ddim_timesteps, 48 | eta=ddim_eta,verbose=False) 49 | self.register_buffer('ddim_sigmas', ddim_sigmas) 50 | self.register_buffer('ddim_alphas', ddim_alphas) 51 | self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) 52 | self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. 
- ddim_alphas)) 53 | sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( 54 | (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * ( 55 | 1 - self.alphas_cumprod / self.alphas_cumprod_prev)) 56 | self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps) 57 | 58 | 59 | @torch.no_grad() 60 | def sample(self, S, shape, input, uc=None, guidance_scale=1, mask=None, x0=None): 61 | self.make_schedule(ddim_num_steps=S) 62 | return self.ddim_sampling(shape, input, uc, guidance_scale, mask=mask, x0=x0) 63 | 64 | 65 | @torch.no_grad() 66 | def ddim_sampling(self, shape, input, uc, guidance_scale=1, mask=None, x0=None): 67 | b = shape[0] 68 | 69 | img = input["x"] 70 | if img == None: 71 | img = torch.randn(shape, device=self.device) 72 | input["x"] = img 73 | 74 | 75 | time_range = np.flip(self.ddim_timesteps) 76 | total_steps = self.ddim_timesteps.shape[0] 77 | 78 | #iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) 79 | iterator = time_range 80 | 81 | if self.alpha_generator_func != None: 82 | alphas = self.alpha_generator_func(len(iterator)) 83 | 84 | 85 | for i, step in enumerate(iterator): 86 | 87 | # set alpha 88 | if self.alpha_generator_func != None: 89 | self.set_alpha_scale(self.model, alphas[i]) 90 | 91 | # run 92 | index = total_steps - i - 1 93 | input["timesteps"] = torch.full((b,), step, device=self.device, dtype=torch.long) 94 | 95 | if mask is not None: 96 | assert x0 is not None 97 | img_orig = self.diffusion.q_sample( x0, input["timesteps"] ) 98 | img = img_orig * mask + (1. - mask) * img 99 | input["x"] = img 100 | 101 | img, pred_x0 = self.p_sample_ddim(input, index=index, uc=uc, guidance_scale=guidance_scale) 102 | input["x"] = img 103 | 104 | return img 105 | 106 | 107 | @torch.no_grad() 108 | def p_sample_ddim(self, input, index, uc=None, guidance_scale=1): 109 | 110 | 111 | e_t = self.model(input) 112 | if uc is not None and guidance_scale != 1: 113 | unconditional_input = dict(x=input["x"], timesteps=input["timesteps"], context=uc) 114 | if "inpainting_extra_input" in input: 115 | unconditional_input["inpainting_extra_input"] = input["inpainting_extra_input"] 116 | e_t_uncond = self.model( unconditional_input ) 117 | e_t = e_t_uncond + guidance_scale * (e_t - e_t_uncond) 118 | 119 | # select parameters corresponding to the currently considered timestep 120 | b = input["x"].shape[0] 121 | a_t = torch.full((b, 1, 1, 1), self.ddim_alphas[index], device=self.device) 122 | a_prev = torch.full((b, 1, 1, 1), self.ddim_alphas_prev[index], device=self.device) 123 | sigma_t = torch.full((b, 1, 1, 1), self.ddim_sigmas[index], device=self.device) 124 | sqrt_one_minus_at = torch.full((b, 1, 1, 1), self.ddim_sqrt_one_minus_alphas[index],device=self.device) 125 | 126 | # current prediction for x_0 127 | pred_x0 = (input["x"] - sqrt_one_minus_at * e_t) / a_t.sqrt() 128 | 129 | # direction pointing to x_t 130 | dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t 131 | noise = sigma_t * torch.randn_like( input["x"] ) 132 | x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise 133 | 134 | return x_prev, pred_x0 135 | -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/ddpm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from functools import partial 5 | from ldm.modules.diffusionmodules.util import make_beta_schedule 6 | 7 | 8 | 9 | 10 | 11 | class DDPM(nn.Module): 12 | def __init__(self, beta_schedule="linear", timesteps=1000, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): 13 | super().__init__() 14 | 15 | self.v_posterior = 0 16 | self.register_schedule(beta_schedule, timesteps, linear_start, linear_end, cosine_s) 17 | 18 | 19 | def register_schedule(self, beta_schedule="linear", timesteps=1000, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): 20 | 21 | betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s) 22 | alphas = 1. - betas 23 | alphas_cumprod = np.cumprod(alphas, axis=0) 24 | alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) 25 | 26 | timesteps, = betas.shape 27 | self.num_timesteps = int(timesteps) 28 | self.linear_start = linear_start 29 | self.linear_end = linear_end 30 | assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep' 31 | 32 | to_torch = partial(torch.tensor, dtype=torch.float32) 33 | 34 | self.register_buffer('betas', to_torch(betas)) 35 | self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) 36 | self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev)) 37 | 38 | # calculations for diffusion q(x_t | x_{t-1}) and others 39 | self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) 40 | self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) 41 | self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod))) 42 | self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod))) 43 | self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1))) 44 | 45 | # calculations for posterior q(x_{t-1} | x_t, x_0) 46 | posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / ( 1. - alphas_cumprod) + self.v_posterior * betas 47 | # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t) 48 | 49 | self.register_buffer('posterior_variance', to_torch(posterior_variance)) 50 | 51 | # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain 52 | self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20)))) 53 | self.register_buffer('posterior_mean_coef1', to_torch( betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))) 54 | self.register_buffer('posterior_mean_coef2', to_torch( (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. 
- alphas_cumprod))) 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/ldm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from tqdm import tqdm 5 | from ldm.util import default 6 | from ldm.modules.diffusionmodules.util import extract_into_tensor 7 | from .ddpm import DDPM 8 | 9 | 10 | 11 | class LatentDiffusion(DDPM): 12 | def __init__(self, *args, **kwargs): 13 | super().__init__(*args, **kwargs) 14 | # hardcoded 15 | self.clip_denoised = False 16 | 17 | 18 | 19 | def q_sample(self, x_start, t, noise=None): 20 | noise = default(noise, lambda: torch.randn_like(x_start)) 21 | return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + 22 | extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise) 23 | 24 | 25 | "Does not support DDPM sampling anymore. Only do DDIM or PLMS" 26 | 27 | # = = = = = = = = = = = = Below is for sampling = = = = = = = = = = = = # 28 | 29 | # def predict_start_from_noise(self, x_t, t, noise): 30 | # return ( extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - 31 | # extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise ) 32 | 33 | # def q_posterior(self, x_start, x_t, t): 34 | # posterior_mean = ( 35 | # extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + 36 | # extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t 37 | # ) 38 | # posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape) 39 | # posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape) 40 | # return posterior_mean, posterior_variance, posterior_log_variance_clipped 41 | 42 | 43 | # def p_mean_variance(self, model, x, c, t): 44 | 45 | # model_out = model(x, t, c) 46 | # x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) 47 | 48 | # if self.clip_denoised: 49 | # x_recon.clamp_(-1., 1.) 
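# (the clamp would keep the reconstructed x_0 inside the [-1, 1] range that the autoencoder decodes from;
#  clip_denoised is hardcoded to False in __init__, and this whole DDPM sampling path is kept only as
#  commented-out reference code, since sampling goes through the DDIM or PLMS samplers instead)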
50 | 51 | # model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t) 52 | # return model_mean, posterior_variance, posterior_log_variance, x_recon 53 | 54 | 55 | # @torch.no_grad() 56 | # def p_sample(self, model, x, c, t): 57 | # b, *_, device = *x.shape, x.device 58 | # model_mean, _, model_log_variance, x0 = self.p_mean_variance(model, x=x, c=c, t=t, ) 59 | # noise = torch.randn_like(x) 60 | 61 | # # no noise when t == 0 62 | # nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1))) 63 | 64 | # return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, x0 65 | 66 | 67 | # @torch.no_grad() 68 | # def p_sample_loop(self, model, shape, c): 69 | # device = self.betas.device 70 | # b = shape[0] 71 | # img = torch.randn(shape, device=device) 72 | 73 | # iterator = tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps) 74 | # for i in iterator: 75 | # ts = torch.full((b,), i, device=device, dtype=torch.long) 76 | # img, x0 = self.p_sample(model, img, c, ts) 77 | 78 | # return img 79 | 80 | 81 | # @torch.no_grad() 82 | # def sample(self, model, shape, c, uc=None, guidance_scale=None): 83 | # return self.p_sample_loop(model, shape, c) 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /gligen/ldm/modules/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/__pycache__/x_transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/__pycache__/x_transformer.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/model.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/openaimodel.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/openaimodel.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/positionnet.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/positionnet.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/util.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/util.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/positionnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from ldm.modules.attention import BasicTransformerBlock 4 | from ldm.modules.diffusionmodules.util import checkpoint, FourierEmbedder 5 | import torch.nn.functional as F 6 | 7 | 8 | 9 | class PositionNet(nn.Module): 10 | def __init__(self, positive_len, out_dim, fourier_freqs=8): 11 | super().__init__() 12 | self.positive_len = positive_len 13 | self.out_dim = out_dim 14 | 15 | self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs) 16 | self.position_dim = fourier_freqs*2*4 # 2 is sin&cos, 4 is xyxy 17 | 18 | self.linears = nn.Sequential( 19 | nn.Linear( self.positive_len + self.position_dim, 512), 20 | nn.SiLU(), 21 | nn.Linear( 512, 512), 22 | nn.SiLU(), 23 | nn.Linear(512, out_dim), 24 | ) 25 | 26 | self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) 27 | self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim])) 28 | 29 | 30 | def forward(self, boxes, masks, positive_embeddings): 31 | B, N, _ = boxes.shape 32 | masks = masks.unsqueeze(-1) 33 | 34 | # embedding position (it may includes padding as placeholder) 35 | xyxy_embedding = self.fourier_embedder(boxes) # B*N*4 --> B*N*C 36 | 37 | # learnable null embedding 38 | positive_null = self.null_positive_feature.view(1,1,-1) 39 | xyxy_null = self.null_position_feature.view(1,1,-1) 40 | 41 | # replace padding with learnable null embedding 42 | positive_embeddings = positive_embeddings*masks + (1-masks)*positive_null 43 | xyxy_embedding = xyxy_embedding*masks + (1-masks)*xyxy_null 44 | 45 | objs = self.linears( torch.cat([positive_embeddings, xyxy_embedding], dim=-1) ) 46 | assert objs.shape == torch.Size([B,N,self.out_dim]) 47 | return objs 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/positionnet_with_image.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from ldm.modules.attention import BasicTransformerBlock 4 | from ldm.modules.diffusionmodules.util import checkpoint, FourierEmbedder 5 | import torch.nn.functional as F 6 | 7 | 8 | 9 | class PositionNet(nn.Module): 10 | def __init__(self, positive_len, out_dim, fourier_freqs=8): 11 | super().__init__() 12 | self.positive_len = 
positive_len 13 | self.out_dim = out_dim 14 | 15 | self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs) 16 | self.position_dim = fourier_freqs*2*4 # 2 is sin&cos, 4 is xyxy 17 | 18 | # -------------------------------------------------------------- # 19 | self.linears_text = nn.Sequential( 20 | nn.Linear( self.positive_len + self.position_dim, 512), 21 | nn.SiLU(), 22 | nn.Linear( 512, 512), 23 | nn.SiLU(), 24 | nn.Linear(512, out_dim), 25 | ) 26 | 27 | self.linears_image = nn.Sequential( 28 | nn.Linear( self.positive_len + self.position_dim, 512), 29 | nn.SiLU(), 30 | nn.Linear( 512, 512), 31 | nn.SiLU(), 32 | nn.Linear(512, out_dim), 33 | ) 34 | 35 | # -------------------------------------------------------------- # 36 | self.null_text_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) 37 | self.null_image_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) 38 | self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim])) 39 | 40 | 41 | def forward(self, boxes, masks, text_masks, image_masks, text_embeddings, image_embeddings): 42 | B, N, _ = boxes.shape 43 | masks = masks.unsqueeze(-1) # B*N*1 44 | text_masks = text_masks.unsqueeze(-1) # B*N*1 45 | image_masks = image_masks.unsqueeze(-1) # B*N*1 46 | 47 | # embedding position (it may includes padding as placeholder) 48 | xyxy_embedding = self.fourier_embedder(boxes) # B*N*4 --> B*N*C 49 | 50 | # learnable null embedding 51 | text_null = self.null_text_feature.view(1,1,-1) # 1*1*C 52 | image_null = self.null_image_feature.view(1,1,-1) # 1*1*C 53 | xyxy_null = self.null_position_feature.view(1,1,-1) # 1*1*C 54 | 55 | # replace padding with learnable null embedding 56 | text_embeddings = text_embeddings*text_masks + (1-text_masks)*text_null 57 | image_embeddings = image_embeddings*image_masks + (1-image_masks)*image_null 58 | xyxy_embedding = xyxy_embedding*masks + (1-masks)*xyxy_null 59 | 60 | objs_text = self.linears_text( torch.cat([text_embeddings, xyxy_embedding], dim=-1) ) 61 | objs_image = self.linears_image( torch.cat([image_embeddings,xyxy_embedding], dim=-1) ) 62 | objs = torch.cat( [objs_text,objs_image], dim=1 ) 63 | 64 | assert objs.shape == torch.Size([B,N*2,self.out_dim]) 65 | return objs 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /gligen/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/modules/distributions/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/distributions/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/distributions/__pycache__/distributions.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/distributions/__pycache__/distributions.cpython-39.pyc -------------------------------------------------------------------------------- 
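The two PositionNet variants above turn per-box coordinates plus text (and optionally image) features into the grounding tokens that GLIGEN's gated attention layers consume. Below is a minimal sketch of exercising the text-only variant in isolation; it assumes the gligen directory is on the Python path so the ldm package resolves, and the batch size, box count, and 768-dimensional feature width are illustrative choices, not values read from the training configs.

import torch
from ldm.modules.diffusionmodules.positionnet import PositionNet

net = PositionNet(positive_len=768, out_dim=768)   # 768 is an assumed text-feature width

B, N = 2, 30                                       # 2 images, up to 30 grounding boxes each
boxes = torch.rand(B, N, 4)                        # xyxy coordinates normalised to [0, 1]
masks = torch.zeros(B, N)                          # 1 = real box, 0 = padding
masks[:, :3] = 1.0                                 # pretend only the first 3 boxes are real
text_features = torch.randn(B, N, 768)             # per-box text embeddings; padded rows are ignored

objs = net(boxes, masks, text_features)            # padded slots fall back to the learned null embeddings
print(objs.shape)                                  # torch.Size([2, 30, 768]), i.e. (B, N, out_dim)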
/gligen/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | return 0.5 * torch.sum( 58 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 
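# (torch.tensor(x).to(tensor) below places any scalar argument on the same dtype and device as the
#  first Tensor found above, so the exp() calls and subtractions in the return expression broadcast cleanly)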
81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /gligen/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 
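A typical round trip is ema.store(model.parameters()), then ema.copy_to(model), run the EMA-weight evaluation, and finally ema.restore(model.parameters()).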
74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /gligen/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/modules/encoders/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/encoders/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/encoders/__pycache__/modules.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/encoders/__pycache__/modules.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /gligen/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator -------------------------------------------------------------------------------- /gligen/ldm/modules/losses/contperceptual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/no? 
5 | 6 | 7 | class LPIPSWithDiscriminator(nn.Module): 8 | def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0, 9 | disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0, 10 | perceptual_weight=1.0, use_actnorm=False, disc_conditional=False, 11 | disc_loss="hinge"): 12 | 13 | super().__init__() 14 | assert disc_loss in ["hinge", "vanilla"] 15 | self.kl_weight = kl_weight 16 | self.pixel_weight = pixelloss_weight 17 | self.perceptual_loss = LPIPS().eval() 18 | self.perceptual_weight = perceptual_weight 19 | # output log variance 20 | self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init) 21 | 22 | self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels, 23 | n_layers=disc_num_layers, 24 | use_actnorm=use_actnorm 25 | ).apply(weights_init) 26 | self.discriminator_iter_start = disc_start 27 | self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss 28 | self.disc_factor = disc_factor 29 | self.discriminator_weight = disc_weight 30 | self.disc_conditional = disc_conditional 31 | 32 | def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): 33 | if last_layer is not None: 34 | nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] 35 | g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] 36 | else: 37 | nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] 38 | g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] 39 | 40 | d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) 41 | d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() 42 | d_weight = d_weight * self.discriminator_weight 43 | return d_weight 44 | 45 | def forward(self, inputs, reconstructions, posteriors, optimizer_idx, 46 | global_step, last_layer=None, cond=None, split="train", 47 | weights=None): 48 | rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) 49 | if self.perceptual_weight > 0: 50 | p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) 51 | rec_loss = rec_loss + self.perceptual_weight * p_loss 52 | 53 | nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar 54 | weighted_nll_loss = nll_loss 55 | if weights is not None: 56 | weighted_nll_loss = weights*nll_loss 57 | weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] 58 | nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] 59 | kl_loss = posteriors.kl() 60 | kl_loss = torch.sum(kl_loss) / kl_loss.shape[0] 61 | 62 | # now the GAN part 63 | if optimizer_idx == 0: 64 | # generator update 65 | if cond is None: 66 | assert not self.disc_conditional 67 | logits_fake = self.discriminator(reconstructions.contiguous()) 68 | else: 69 | assert self.disc_conditional 70 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1)) 71 | g_loss = -torch.mean(logits_fake) 72 | 73 | if self.disc_factor > 0.0: 74 | try: 75 | d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) 76 | except RuntimeError: 77 | assert not self.training 78 | d_weight = torch.tensor(0.0) 79 | else: 80 | d_weight = torch.tensor(0.0) 81 | 82 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 83 | loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss 84 | 85 | log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(), 86 | 
"{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(), 87 | "{}/rec_loss".format(split): rec_loss.detach().mean(), 88 | "{}/d_weight".format(split): d_weight.detach(), 89 | "{}/disc_factor".format(split): torch.tensor(disc_factor), 90 | "{}/g_loss".format(split): g_loss.detach().mean(), 91 | } 92 | return loss, log 93 | 94 | if optimizer_idx == 1: 95 | # second pass for discriminator update 96 | if cond is None: 97 | logits_real = self.discriminator(inputs.contiguous().detach()) 98 | logits_fake = self.discriminator(reconstructions.contiguous().detach()) 99 | else: 100 | logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1)) 101 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1)) 102 | 103 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 104 | d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) 105 | 106 | log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(), 107 | "{}/logits_real".format(split): logits_real.detach().mean(), 108 | "{}/logits_fake".format(split): logits_fake.detach().mean() 109 | } 110 | return d_loss, log 111 | 112 | -------------------------------------------------------------------------------- /gligen/ldm/util.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import torch 4 | import numpy as np 5 | 6 | from inspect import isfunction 7 | from PIL import Image, ImageDraw, ImageFont 8 | 9 | 10 | def log_txt_as_img(wh, xc, size=10): 11 | # wh a tuple of (width, height) 12 | # xc a list of captions to plot 13 | b = len(xc) 14 | txts = list() 15 | for bi in range(b): 16 | txt = Image.new("RGB", wh, color="white") 17 | draw = ImageDraw.Draw(txt) 18 | font = ImageFont.truetype('data/DejaVuSans.ttf', size=size) 19 | nc = int(40 * (wh[0] / 256)) 20 | lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc)) 21 | 22 | try: 23 | draw.text((0, 0), lines, fill="black", font=font) 24 | except UnicodeEncodeError: 25 | print("Cant encode string for logging. Skipping.") 26 | 27 | txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0 28 | txts.append(txt) 29 | txts = np.stack(txts) 30 | txts = torch.tensor(txts) 31 | return txts 32 | 33 | 34 | def ismap(x): 35 | if not isinstance(x, torch.Tensor): 36 | return False 37 | return (len(x.shape) == 4) and (x.shape[1] > 3) 38 | 39 | 40 | def isimage(x): 41 | if not isinstance(x,torch.Tensor): 42 | return False 43 | return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) 44 | 45 | 46 | def exists(x): 47 | return x is not None 48 | 49 | 50 | def default(val, d): 51 | if exists(val): 52 | return val 53 | return d() if isfunction(d) else d 54 | 55 | 56 | def mean_flat(tensor): 57 | """ 58 | https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86 59 | Take the mean over all non-batch dimensions. 
60 | """ 61 | return tensor.mean(dim=list(range(1, len(tensor.shape)))) 62 | 63 | 64 | def count_params(model, verbose=False): 65 | total_params = sum(p.numel() for p in model.parameters()) 66 | if verbose: 67 | print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") 68 | return total_params 69 | 70 | 71 | def instantiate_from_config(config): 72 | if not "target" in config: 73 | if config == '__is_first_stage__': 74 | return None 75 | elif config == "__is_unconditional__": 76 | return None 77 | raise KeyError("Expected key `target` to instantiate.") 78 | return get_obj_from_str(config["target"])(**config.get("params", dict())) 79 | 80 | 81 | def get_obj_from_str(string, reload=False): 82 | module, cls = string.rsplit(".", 1) 83 | if reload: 84 | module_imp = importlib.import_module(module) 85 | importlib.reload(module_imp) 86 | return getattr(importlib.import_module(module, package=None), cls) -------------------------------------------------------------------------------- /gligen/projection_matrix.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/projection_matrix.pth -------------------------------------------------------------------------------- /groundingdino/_C.cpython-39-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/_C.cpython-39-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /groundingdino/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/__init__.py -------------------------------------------------------------------------------- /groundingdino/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/config/GroundingDINO_SwinB.cfg.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_B_384_22k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 
| use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /groundingdino/config/GroundingDINO_SwinT_OGC.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_T_224_1k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /groundingdino/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/datasets/__init__.py -------------------------------------------------------------------------------- /groundingdino/datasets/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/datasets/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/datasets/__pycache__/transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/datasets/__pycache__/transforms.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Conditional DETR 8 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 
9 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 10 | # ------------------------------------------------------------------------ 11 | # Copied from DETR (https://github.com/facebookresearch/detr) 12 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 13 | # ------------------------------------------------------------------------ 14 | 15 | from .groundingdino import build_groundingdino 16 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone 2 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | namespace groundingdino { 20 | 21 | at::Tensor 22 | ms_deform_attn_forward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const int im2col_step) 29 | { 30 | if (value.type().is_cuda()) 31 | { 32 | #ifdef WITH_CUDA 33 | return ms_deform_attn_cuda_forward( 34 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 35 | #else 36 | AT_ERROR("Not compiled with GPU support"); 37 | #endif 38 | } 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | std::vector 43 | ms_deform_attn_backward( 44 | const at::Tensor &value, 45 | const at::Tensor &spatial_shapes, 46 | const at::Tensor &level_start_index, 47 | const at::Tensor &sampling_loc, 48 | const at::Tensor &attn_weight, 49 | const at::Tensor &grad_output, 50 | const int im2col_step) 51 | { 52 | if (value.type().is_cuda()) 53 | { 54 | #ifdef WITH_CUDA 55 | return ms_deform_attn_cuda_backward( 56 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 57 | #else 58 | AT_ERROR("Not compiled with GPU support"); 59 | #endif 60 | } 61 | AT_ERROR("Not implemented on the CPU"); 62 | } 63 | 64 | } // namespace groundingdino -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | namespace groundingdino { 17 | 18 | at::Tensor 19 | ms_deform_attn_cpu_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step) 26 | { 27 | AT_ERROR("Not implement on cpu"); 28 | } 29 | 30 | std::vector 31 | ms_deform_attn_cpu_backward( 32 | const at::Tensor &value, 33 | const at::Tensor &spatial_shapes, 34 | const at::Tensor &level_start_index, 35 | const at::Tensor &sampling_loc, 36 | const at::Tensor &attn_weight, 37 | const at::Tensor &grad_output, 38 | const int im2col_step) 39 | { 40 | AT_ERROR("Not implement on cpu"); 41 | } 42 | 43 | } // namespace groundingdino 44 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | namespace groundingdino { 15 | 16 | at::Tensor 17 | ms_deform_attn_cpu_forward( 18 | const at::Tensor &value, 19 | const at::Tensor &spatial_shapes, 20 | const at::Tensor &level_start_index, 21 | const at::Tensor &sampling_loc, 22 | const at::Tensor &attn_weight, 23 | const int im2col_step); 24 | 25 | std::vector 26 | ms_deform_attn_cpu_backward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const at::Tensor &grad_output, 33 | const int im2col_step); 34 | 35 | } // namespace groundingdino 36 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | namespace groundingdino { 15 | 16 | at::Tensor ms_deform_attn_cuda_forward( 17 | const at::Tensor &value, 18 | const at::Tensor &spatial_shapes, 19 | const at::Tensor &level_start_index, 20 | const at::Tensor &sampling_loc, 21 | const at::Tensor &attn_weight, 22 | const int im2col_step); 23 | 24 | std::vector ms_deform_attn_cuda_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | } // namespace groundingdino -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/cuda_version.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace groundingdino { 4 | int get_cudart_version() { 5 | return CUDART_VERSION; 6 | } 7 | } // namespace groundingdino 8 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | #include "MsDeformAttn/ms_deform_attn.h" 4 | 5 | namespace groundingdino { 6 | 7 | #ifdef WITH_CUDA 8 | extern int get_cudart_version(); 9 | #endif 10 | 11 | std::string get_cuda_version() { 12 | #ifdef WITH_CUDA 13 | std::ostringstream oss; 14 | 15 | // copied from 16 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 17 | auto printCudaStyleVersion = [&](int v) { 18 | oss << (v / 1000) << "." << (v / 10 % 100); 19 | if (v % 10 != 0) { 20 | oss << "." << (v % 10); 21 | } 22 | }; 23 | printCudaStyleVersion(get_cudart_version()); 24 | return oss.str(); 25 | #else 26 | return std::string("not available"); 27 | #endif 28 | } 29 | 30 | // similar to 31 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp 32 | std::string get_compiler_version() { 33 | std::ostringstream ss; 34 | #if defined(__GNUC__) 35 | #ifndef __clang__ 36 | { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } 37 | #endif 38 | #endif 39 | 40 | #if defined(__clang_major__) 41 | { 42 | ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
43 | << __clang_patchlevel__; 44 | } 45 | #endif 46 | 47 | #if defined(_MSC_VER) 48 | { ss << "MSVC " << _MSC_FULL_VER; } 49 | #endif 50 | return ss.str(); 51 | } 52 | 53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 54 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 55 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 56 | } 57 | 58 | } // namespace groundingdino -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/transformer_vanilla.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | """ 10 | DETR Transformer class. 11 | 12 | Copy-paste from torch.nn.Transformer with modifications: 13 | * positional encodings are passed in MHattention 14 | * extra LN at the end of encoder is removed 15 | * decoder returns a stack of activations from all decoding layers 16 | """ 17 | from typing import Optional 18 | 19 | import torch 20 | import torch.nn.functional as F 21 | from torch import Tensor, nn 22 | 23 | from .utils import ( 24 | MLP, 25 | _get_activation_fn, 26 | _get_clones, 27 | gen_encoder_output_proposals, 28 | gen_sineembed_for_position, 29 | sigmoid_focal_loss, 30 | ) 31 | 32 | 33 | class TextTransformer(nn.Module): 34 | def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1): 35 | super().__init__() 36 | self.num_layers = num_layers 37 | self.d_model = d_model 38 | self.nheads = nheads 39 | self.dim_feedforward = dim_feedforward 40 | self.norm = None 41 | 42 | single_encoder_layer = TransformerEncoderLayer( 43 | d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout 44 | ) 45 | self.layers = _get_clones(single_encoder_layer, num_layers) 46 | 47 | def forward(self, memory_text: torch.Tensor, text_attention_mask: torch.Tensor): 48 | """ 49 | 50 | Args: 51 | text_attention_mask: bs, num_token 52 | memory_text: bs, num_token, d_model 53 | 54 | Raises: 55 | RuntimeError: _description_ 56 | 57 | Returns: 58 | output: bs, num_token, d_model 59 | """ 60 | 61 | output = memory_text.transpose(0, 1) 62 | 63 | for layer in self.layers: 64 | output = layer(output, src_key_padding_mask=text_attention_mask) 65 | 66 | if self.norm is not None: 67 | output = self.norm(output) 68 | 69 | return output.transpose(0, 1) 70 | 71 | 72 | class TransformerEncoderLayer(nn.Module): 73 | def __init__( 74 | self, 75 | d_model, 76 | nhead, 77 | dim_feedforward=2048, 78 | dropout=0.1, 79 | activation="relu", 80 | normalize_before=False, 81 | ): 82 | super().__init__() 83 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 84 | # Implementation of Feedforward model 85 | self.linear1 = nn.Linear(d_model, dim_feedforward) 86 | self.dropout = nn.Dropout(dropout) 87 | self.linear2 = nn.Linear(dim_feedforward, d_model) 88 | 89 | self.norm1 = nn.LayerNorm(d_model) 90 | self.norm2 = nn.LayerNorm(d_model) 91 | self.dropout1 = 
nn.Dropout(dropout) 92 | self.dropout2 = nn.Dropout(dropout) 93 | 94 | self.activation = _get_activation_fn(activation) 95 | self.normalize_before = normalize_before 96 | self.nhead = nhead 97 | 98 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 99 | return tensor if pos is None else tensor + pos 100 | 101 | def forward( 102 | self, 103 | src, 104 | src_mask: Optional[Tensor] = None, 105 | src_key_padding_mask: Optional[Tensor] = None, 106 | pos: Optional[Tensor] = None, 107 | ): 108 | # repeat attn mask 109 | if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]: 110 | # bs, num_q, num_k 111 | src_mask = src_mask.repeat(self.nhead, 1, 1) 112 | 113 | q = k = self.with_pos_embed(src, pos) 114 | 115 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0] 116 | 117 | # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] 118 | src = src + self.dropout1(src2) 119 | src = self.norm1(src) 120 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 121 | src = src + self.dropout2(src2) 122 | src = self.norm2(src) 123 | return src 124 | -------------------------------------------------------------------------------- /groundingdino/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | from .GroundingDINO import build_groundingdino 9 | 10 | 11 | def build_model(args): 12 | # we use register to maintain models from catdet6 on. 13 | from .registry import MODULE_BUILD_FUNCS 14 | 15 | assert args.modelname in MODULE_BUILD_FUNCS._module_dict 16 | build_func = MODULE_BUILD_FUNCS.get(args.modelname) 17 | model = build_func(args) 18 | return model 19 | -------------------------------------------------------------------------------- /groundingdino/models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/__pycache__/registry.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/__pycache__/registry.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/registry.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # -*- coding: utf-8 -*- 8 | # @Author: Yihao Chen 9 | # @Date: 2021-08-16 16:03:17 10 | # @Last Modified by: Shilong Liu 11 | # @Last Modified time: 2022-01-23 15:26 12 | # modified from mmcv 13 | 14 | import inspect 15 | from functools import partial 16 | 17 | 18 | class Registry(object): 19 | def __init__(self, name): 20 | self._name = name 21 | self._module_dict = dict() 22 | 23 | def __repr__(self): 24 | format_str = self.__class__.__name__ + "(name={}, items={})".format( 25 | self._name, list(self._module_dict.keys()) 26 | ) 27 | return format_str 28 | 29 | def __len__(self): 30 | return len(self._module_dict) 31 | 32 | @property 33 | def name(self): 34 | return self._name 35 | 36 | @property 37 | def module_dict(self): 38 | return self._module_dict 39 | 40 | def get(self, key): 41 | return self._module_dict.get(key, None) 42 | 43 | def registe_with_name(self, module_name=None, force=False): 44 | return partial(self.register, module_name=module_name, force=force) 45 | 46 | def register(self, module_build_function, module_name=None, force=False): 47 | """Register a module build function. 48 | Args: 49 | module (:obj:`nn.Module`): Module to be registered. 50 | """ 51 | if not inspect.isfunction(module_build_function): 52 | raise TypeError( 53 | "module_build_function must be a function, but got {}".format( 54 | type(module_build_function) 55 | ) 56 | ) 57 | if module_name is None: 58 | module_name = module_build_function.__name__ 59 | if not force and module_name in self._module_dict: 60 | raise KeyError("{} is already registered in {}".format(module_name, self.name)) 61 | self._module_dict[module_name] = module_build_function 62 | 63 | return module_build_function 64 | 65 | 66 | MODULE_BUILD_FUNCS = Registry("model build functions") 67 | -------------------------------------------------------------------------------- /groundingdino/util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | -------------------------------------------------------------------------------- /groundingdino/util/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/util/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/util/__pycache__/box_ops.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/util/__pycache__/box_ops.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/util/__pycache__/slconfig.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/util/__pycache__/slconfig.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/util/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/util/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/util/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 
4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] 12 | return torch.stack(b, dim=-1) 13 | 14 | 15 | def box_xyxy_to_cxcywh(x): 16 | x0, y0, x1, y1 = x.unbind(-1) 17 | b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] 18 | return torch.stack(b, dim=-1) 19 | 20 | 21 | # modified from torchvision to also return the union 22 | def box_iou(boxes1, boxes2): 23 | area1 = box_area(boxes1) 24 | area2 = box_area(boxes2) 25 | 26 | # import ipdb; ipdb.set_trace() 27 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 28 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 29 | 30 | wh = (rb - lt).clamp(min=0) # [N,M,2] 31 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 32 | 33 | union = area1[:, None] + area2 - inter 34 | 35 | iou = inter / (union + 1e-6) 36 | return iou, union 37 | 38 | 39 | def generalized_box_iou(boxes1, boxes2): 40 | """ 41 | Generalized IoU from https://giou.stanford.edu/ 42 | 43 | The boxes should be in [x0, y0, x1, y1] format 44 | 45 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 46 | and M = len(boxes2) 47 | """ 48 | # degenerate boxes gives inf / nan results 49 | # so do an early check 50 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 51 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 52 | # except: 53 | # import ipdb; ipdb.set_trace() 54 | iou, union = box_iou(boxes1, boxes2) 55 | 56 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 57 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 58 | 59 | wh = (rb - lt).clamp(min=0) # [N,M,2] 60 | area = wh[:, :, 0] * wh[:, :, 1] 61 | 62 | return iou - (area - union) / (area + 1e-6) 63 | 64 | 65 | # modified from torchvision to also return the union 66 | def box_iou_pairwise(boxes1, boxes2): 67 | area1 = box_area(boxes1) 68 | area2 = box_area(boxes2) 69 | 70 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 71 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 72 | 73 | wh = (rb - lt).clamp(min=0) # [N,2] 74 | inter = wh[:, 0] * wh[:, 1] # [N] 75 | 76 | union = area1 + area2 - inter 77 | 78 | iou = inter / union 79 | return iou, union 80 | 81 | 82 | def generalized_box_iou_pairwise(boxes1, boxes2): 83 | """ 84 | Generalized IoU from https://giou.stanford.edu/ 85 | 86 | Input: 87 | - boxes1, boxes2: N,4 88 | Output: 89 | - giou: N, 4 90 | """ 91 | # degenerate boxes gives inf / nan results 92 | # so do an early check 93 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 94 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 95 | assert boxes1.shape == boxes2.shape 96 | iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 97 | 98 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 99 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 100 | 101 | wh = (rb - lt).clamp(min=0) # [N,2] 102 | area = wh[:, 0] * wh[:, 1] 103 | 104 | return iou - (area - union) / area 105 | 106 | 107 | def masks_to_boxes(masks): 108 | """Compute the bounding boxes around the provided masks 109 | 110 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
111 | 112 | Returns a [N, 4] tensors, with the boxes in xyxy format 113 | """ 114 | if masks.numel() == 0: 115 | return torch.zeros((0, 4), device=masks.device) 116 | 117 | h, w = masks.shape[-2:] 118 | 119 | y = torch.arange(0, h, dtype=torch.float) 120 | x = torch.arange(0, w, dtype=torch.float) 121 | y, x = torch.meshgrid(y, x) 122 | 123 | x_mask = masks * x.unsqueeze(0) 124 | x_max = x_mask.flatten(1).max(-1)[0] 125 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 126 | 127 | y_mask = masks * y.unsqueeze(0) 128 | y_max = y_mask.flatten(1).max(-1)[0] 129 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 130 | 131 | return torch.stack([x_min, y_min, x_max, y_max], 1) 132 | 133 | 134 | if __name__ == "__main__": 135 | x = torch.rand(5, 4) 136 | y = torch.rand(3, 4) 137 | iou, union = box_iou(x, y) 138 | import ipdb 139 | 140 | ipdb.set_trace() 141 | -------------------------------------------------------------------------------- /groundingdino/util/get_tokenlizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast 2 | 3 | 4 | def get_tokenlizer(text_encoder_type): 5 | if not isinstance(text_encoder_type, str): 6 | # print("text_encoder_type is not a str") 7 | if hasattr(text_encoder_type, "text_encoder_type"): 8 | text_encoder_type = text_encoder_type.text_encoder_type 9 | elif text_encoder_type.get("text_encoder_type", False): 10 | text_encoder_type = text_encoder_type.get("text_encoder_type") 11 | else: 12 | raise ValueError( 13 | "Unknown type of text_encoder_type: {}".format(type(text_encoder_type)) 14 | ) 15 | print("final text_encoder_type: {}".format(text_encoder_type)) 16 | 17 | tokenizer = AutoTokenizer.from_pretrained(text_encoder_type) 18 | return tokenizer 19 | 20 | 21 | def get_pretrained_language_model(text_encoder_type): 22 | if text_encoder_type == "bert-base-uncased": 23 | return BertModel.from_pretrained(text_encoder_type) 24 | if text_encoder_type == "roberta-base": 25 | return RobertaModel.from_pretrained(text_encoder_type) 26 | raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type)) 27 | -------------------------------------------------------------------------------- /groundingdino/util/inference.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List 2 | 3 | import cv2 4 | import numpy as np 5 | import supervision as sv 6 | import torch 7 | from PIL import Image 8 | from torchvision.ops import box_convert 9 | 10 | import groundingdino.datasets.transforms as T 11 | from groundingdino.models import build_model 12 | from groundingdino.util.misc import clean_state_dict 13 | from groundingdino.util.slconfig import SLConfig 14 | from groundingdino.util.utils import get_phrases_from_posmap 15 | 16 | 17 | def preprocess_caption(caption: str) -> str: 18 | result = caption.lower().strip() 19 | if result.endswith("."): 20 | return result 21 | return result + "." 
22 | 23 | 24 | def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"): 25 | args = SLConfig.fromfile(model_config_path) 26 | args.device = device 27 | model = build_model(args) 28 | checkpoint = torch.load(model_checkpoint_path, map_location="cpu") 29 | model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False) 30 | model.eval() 31 | return model 32 | 33 | 34 | def load_image(image_path: str) -> Tuple[np.array, torch.Tensor]: 35 | transform = T.Compose( 36 | [ 37 | T.RandomResize([800], max_size=1333), 38 | T.ToTensor(), 39 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 40 | ] 41 | ) 42 | image_source = Image.open(image_path).convert("RGB") 43 | image = np.asarray(image_source) 44 | image_transformed, _ = transform(image_source, None) 45 | return image, image_transformed 46 | 47 | 48 | def predict( 49 | model, 50 | image: torch.Tensor, 51 | caption: str, 52 | box_threshold: float, 53 | text_threshold: float, 54 | device: str = "cuda" 55 | ) -> Tuple[torch.Tensor, torch.Tensor, List[str]]: 56 | caption = preprocess_caption(caption=caption) 57 | 58 | model = model.to(device) 59 | image = image.to(device) 60 | 61 | with torch.no_grad(): 62 | outputs = model(image[None], captions=[caption]) 63 | 64 | prediction_logits = outputs["pred_logits"].cpu().sigmoid()[0] # prediction_logits.shape = (nq, 256) 65 | prediction_boxes = outputs["pred_boxes"].cpu()[0] # prediction_boxes.shape = (nq, 4) 66 | 67 | mask = prediction_logits.max(dim=1)[0] > box_threshold 68 | logits = prediction_logits[mask] # logits.shape = (n, 256) 69 | boxes = prediction_boxes[mask] # boxes.shape = (n, 4) 70 | 71 | tokenizer = model.tokenizer 72 | tokenized = tokenizer(caption) 73 | 74 | phrases = [ 75 | get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '') 76 | for logit 77 | in logits 78 | ] 79 | 80 | return boxes, logits.max(dim=1)[0], phrases 81 | 82 | 83 | def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray: 84 | h, w, _ = image_source.shape 85 | boxes = boxes * torch.Tensor([w, h, w, h]) 86 | xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() 87 | detections = sv.Detections(xyxy=xyxy) 88 | 89 | labels = [ 90 | f"{phrase} {logit:.2f}" 91 | for phrase, logit 92 | in zip(phrases, logits) 93 | ] 94 | 95 | box_annotator = sv.BoxAnnotator() 96 | annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR) 97 | annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) 98 | return annotated_frame 99 | -------------------------------------------------------------------------------- /groundingdino/util/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import functools 3 | import logging 4 | import os 5 | import sys 6 | 7 | from termcolor import colored 8 | 9 | 10 | class _ColorfulFormatter(logging.Formatter): 11 | def __init__(self, *args, **kwargs): 12 | self._root_name = kwargs.pop("root_name") + "." 13 | self._abbrev_name = kwargs.pop("abbrev_name", "") 14 | if len(self._abbrev_name): 15 | self._abbrev_name = self._abbrev_name + "." 
16 | super(_ColorfulFormatter, self).__init__(*args, **kwargs) 17 | 18 | def formatMessage(self, record): 19 | record.name = record.name.replace(self._root_name, self._abbrev_name) 20 | log = super(_ColorfulFormatter, self).formatMessage(record) 21 | if record.levelno == logging.WARNING: 22 | prefix = colored("WARNING", "red", attrs=["blink"]) 23 | elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: 24 | prefix = colored("ERROR", "red", attrs=["blink", "underline"]) 25 | else: 26 | return log 27 | return prefix + " " + log 28 | 29 | 30 | # so that calling setup_logger multiple times won't add many handlers 31 | @functools.lru_cache() 32 | def setup_logger(output=None, distributed_rank=0, *, color=True, name="imagenet", abbrev_name=None): 33 | """ 34 | Initialize the detectron2 logger and set its verbosity level to "INFO". 35 | 36 | Args: 37 | output (str): a file name or a directory to save log. If None, will not save log file. 38 | If ends with ".txt" or ".log", assumed to be a file name. 39 | Otherwise, logs will be saved to `output/log.txt`. 40 | name (str): the root module name of this logger 41 | 42 | Returns: 43 | logging.Logger: a logger 44 | """ 45 | logger = logging.getLogger(name) 46 | logger.setLevel(logging.DEBUG) 47 | logger.propagate = False 48 | 49 | if abbrev_name is None: 50 | abbrev_name = name 51 | 52 | plain_formatter = logging.Formatter( 53 | "[%(asctime)s.%(msecs)03d]: %(message)s", datefmt="%m/%d %H:%M:%S" 54 | ) 55 | # stdout logging: master only 56 | if distributed_rank == 0: 57 | ch = logging.StreamHandler(stream=sys.stdout) 58 | ch.setLevel(logging.DEBUG) 59 | if color: 60 | formatter = _ColorfulFormatter( 61 | colored("[%(asctime)s.%(msecs)03d]: ", "green") + "%(message)s", 62 | datefmt="%m/%d %H:%M:%S", 63 | root_name=name, 64 | abbrev_name=str(abbrev_name), 65 | ) 66 | else: 67 | formatter = plain_formatter 68 | ch.setFormatter(formatter) 69 | logger.addHandler(ch) 70 | 71 | # file logging: all workers 72 | if output is not None: 73 | if output.endswith(".txt") or output.endswith(".log"): 74 | filename = output 75 | else: 76 | filename = os.path.join(output, "log.txt") 77 | if distributed_rank > 0: 78 | filename = filename + f".rank{distributed_rank}" 79 | os.makedirs(os.path.dirname(filename), exist_ok=True) 80 | 81 | fh = logging.StreamHandler(_cached_log_stream(filename)) 82 | fh.setLevel(logging.DEBUG) 83 | fh.setFormatter(plain_formatter) 84 | logger.addHandler(fh) 85 | 86 | return logger 87 | 88 | 89 | # cache the opened file object, so that different calls to `setup_logger` 90 | # with the same file name can safely write to the same file. 
91 | @functools.lru_cache(maxsize=None) 92 | def _cached_log_stream(filename): 93 | return open(filename, "a") 94 | -------------------------------------------------------------------------------- /groundingdino/util/slio.py: -------------------------------------------------------------------------------- 1 | # ========================================================== 2 | # Modified from mmcv 3 | # ========================================================== 4 | 5 | import json 6 | import pickle 7 | from abc import ABCMeta, abstractmethod 8 | from pathlib import Path 9 | 10 | import yaml 11 | 12 | try: 13 | from yaml import CLoader as Loader, CDumper as Dumper 14 | except ImportError: 15 | from yaml import Loader, Dumper 16 | 17 | 18 | # =========================== 19 | # Rigister handler 20 | # =========================== 21 | 22 | 23 | class BaseFileHandler(metaclass=ABCMeta): 24 | @abstractmethod 25 | def load_from_fileobj(self, file, **kwargs): 26 | pass 27 | 28 | @abstractmethod 29 | def dump_to_fileobj(self, obj, file, **kwargs): 30 | pass 31 | 32 | @abstractmethod 33 | def dump_to_str(self, obj, **kwargs): 34 | pass 35 | 36 | def load_from_path(self, filepath, mode="r", **kwargs): 37 | with open(filepath, mode) as f: 38 | return self.load_from_fileobj(f, **kwargs) 39 | 40 | def dump_to_path(self, obj, filepath, mode="w", **kwargs): 41 | with open(filepath, mode) as f: 42 | self.dump_to_fileobj(obj, f, **kwargs) 43 | 44 | 45 | class JsonHandler(BaseFileHandler): 46 | def load_from_fileobj(self, file): 47 | return json.load(file) 48 | 49 | def dump_to_fileobj(self, obj, file, **kwargs): 50 | json.dump(obj, file, **kwargs) 51 | 52 | def dump_to_str(self, obj, **kwargs): 53 | return json.dumps(obj, **kwargs) 54 | 55 | 56 | class PickleHandler(BaseFileHandler): 57 | def load_from_fileobj(self, file, **kwargs): 58 | return pickle.load(file, **kwargs) 59 | 60 | def load_from_path(self, filepath, **kwargs): 61 | return super(PickleHandler, self).load_from_path(filepath, mode="rb", **kwargs) 62 | 63 | def dump_to_str(self, obj, **kwargs): 64 | kwargs.setdefault("protocol", 2) 65 | return pickle.dumps(obj, **kwargs) 66 | 67 | def dump_to_fileobj(self, obj, file, **kwargs): 68 | kwargs.setdefault("protocol", 2) 69 | pickle.dump(obj, file, **kwargs) 70 | 71 | def dump_to_path(self, obj, filepath, **kwargs): 72 | super(PickleHandler, self).dump_to_path(obj, filepath, mode="wb", **kwargs) 73 | 74 | 75 | class YamlHandler(BaseFileHandler): 76 | def load_from_fileobj(self, file, **kwargs): 77 | kwargs.setdefault("Loader", Loader) 78 | return yaml.load(file, **kwargs) 79 | 80 | def dump_to_fileobj(self, obj, file, **kwargs): 81 | kwargs.setdefault("Dumper", Dumper) 82 | yaml.dump(obj, file, **kwargs) 83 | 84 | def dump_to_str(self, obj, **kwargs): 85 | kwargs.setdefault("Dumper", Dumper) 86 | return yaml.dump(obj, **kwargs) 87 | 88 | 89 | file_handlers = { 90 | "json": JsonHandler(), 91 | "yaml": YamlHandler(), 92 | "yml": YamlHandler(), 93 | "pickle": PickleHandler(), 94 | "pkl": PickleHandler(), 95 | } 96 | 97 | # =========================== 98 | # load and dump 99 | # =========================== 100 | 101 | 102 | def is_str(x): 103 | """Whether the input is an string instance. 104 | 105 | Note: This method is deprecated since python 2 is no longer supported. 106 | """ 107 | return isinstance(x, str) 108 | 109 | 110 | def slload(file, file_format=None, **kwargs): 111 | """Load data from json/yaml/pickle files. 112 | 113 | This method provides a unified api for loading data from serialized files. 
114 | 115 | Args: 116 | file (str or :obj:`Path` or file-like object): Filename or a file-like 117 | object. 118 | file_format (str, optional): If not specified, the file format will be 119 | inferred from the file extension, otherwise use the specified one. 120 | Currently supported formats include "json", "yaml/yml" and 121 | "pickle/pkl". 122 | 123 | Returns: 124 | The content from the file. 125 | """ 126 | if isinstance(file, Path): 127 | file = str(file) 128 | if file_format is None and is_str(file): 129 | file_format = file.split(".")[-1] 130 | if file_format not in file_handlers: 131 | raise TypeError(f"Unsupported format: {file_format}") 132 | 133 | handler = file_handlers[file_format] 134 | if is_str(file): 135 | obj = handler.load_from_path(file, **kwargs) 136 | elif hasattr(file, "read"): 137 | obj = handler.load_from_fileobj(file, **kwargs) 138 | else: 139 | raise TypeError('"file" must be a filepath str or a file-object') 140 | return obj 141 | 142 | 143 | def sldump(obj, file=None, file_format=None, **kwargs): 144 | """Dump data to json/yaml/pickle strings or files. 145 | 146 | This method provides a unified api for dumping data as strings or to files, 147 | and also supports custom arguments for each file format. 148 | 149 | Args: 150 | obj (any): The python object to be dumped. 151 | file (str or :obj:`Path` or file-like object, optional): If not 152 | specified, then the object is dump to a str, otherwise to a file 153 | specified by the filename or file-like object. 154 | file_format (str, optional): Same as :func:`load`. 155 | 156 | Returns: 157 | bool: True for success, False otherwise. 158 | """ 159 | if isinstance(file, Path): 160 | file = str(file) 161 | if file_format is None: 162 | if is_str(file): 163 | file_format = file.split(".")[-1] 164 | elif file is None: 165 | raise ValueError("file_format must be specified since file is None") 166 | if file_format not in file_handlers: 167 | raise TypeError(f"Unsupported format: {file_format}") 168 | 169 | handler = file_handlers[file_format] 170 | if file is None: 171 | return handler.dump_to_str(obj, **kwargs) 172 | elif is_str(file): 173 | handler.dump_to_path(obj, file, **kwargs) 174 | elif hasattr(file, "write"): 175 | handler.dump_to_fileobj(obj, file, **kwargs) 176 | else: 177 | raise TypeError('"file" must be a filename str or a file-object') 178 | -------------------------------------------------------------------------------- /groundingdino/util/time_counter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | 5 | class TimeCounter: 6 | def __init__(self) -> None: 7 | pass 8 | 9 | def clear(self): 10 | self.timedict = {} 11 | self.basetime = time.perf_counter() 12 | 13 | def timeit(self, name): 14 | nowtime = time.perf_counter() - self.basetime 15 | self.timedict[name] = nowtime 16 | self.basetime = time.perf_counter() 17 | 18 | 19 | class TimeHolder: 20 | def __init__(self) -> None: 21 | self.timedict = {} 22 | 23 | def update(self, _timedict: dict): 24 | for k, v in _timedict.items(): 25 | if k not in self.timedict: 26 | self.timedict[k] = AverageMeter(name=k, val_only=True) 27 | self.timedict[k].update(val=v) 28 | 29 | def final_res(self): 30 | return {k: v.avg for k, v in self.timedict.items()} 31 | 32 | def __str__(self): 33 | return json.dumps(self.final_res(), indent=2) 34 | 35 | 36 | class AverageMeter(object): 37 | """Computes and stores the average and current value""" 38 | 39 | def __init__(self, name, fmt=":f", val_only=False): 40 
| self.name = name 41 | self.fmt = fmt 42 | self.val_only = val_only 43 | self.reset() 44 | 45 | def reset(self): 46 | self.val = 0 47 | self.avg = 0 48 | self.sum = 0 49 | self.count = 0 50 | 51 | def update(self, val, n=1): 52 | self.val = val 53 | self.sum += val * n 54 | self.count += n 55 | self.avg = self.sum / self.count 56 | 57 | def __str__(self): 58 | if self.val_only: 59 | fmtstr = "{name} {val" + self.fmt + "}" 60 | else: 61 | fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" 62 | return fmtstr.format(**self.__dict__) 63 | -------------------------------------------------------------------------------- /groundingdino/util/vl_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from typing import List 4 | 5 | import torch 6 | 7 | 8 | def create_positive_map_from_span(tokenized, token_span, max_text_len=256): 9 | """construct a map such that positive_map[i,j] = True iff box i is associated to token j 10 | Input: 11 | - tokenized: 12 | - input_ids: Tensor[1, ntokens] 13 | - attention_mask: Tensor[1, ntokens] 14 | - token_span: list with length num_boxes. 15 | - each item: [start_idx, end_idx] 16 | """ 17 | positive_map = torch.zeros((len(token_span), max_text_len), dtype=torch.float) 18 | for j, tok_list in enumerate(token_span): 19 | for (beg, end) in tok_list: 20 | beg_pos = tokenized.char_to_token(beg) 21 | end_pos = tokenized.char_to_token(end - 1) 22 | if beg_pos is None: 23 | try: 24 | beg_pos = tokenized.char_to_token(beg + 1) 25 | if beg_pos is None: 26 | beg_pos = tokenized.char_to_token(beg + 2) 27 | except: 28 | beg_pos = None 29 | if end_pos is None: 30 | try: 31 | end_pos = tokenized.char_to_token(end - 2) 32 | if end_pos is None: 33 | end_pos = tokenized.char_to_token(end - 3) 34 | except: 35 | end_pos = None 36 | if beg_pos is None or end_pos is None: 37 | continue 38 | 39 | assert beg_pos is not None and end_pos is not None 40 | if os.environ.get("SHILONG_DEBUG_ONLY_ONE_POS", None) == "TRUE": 41 | positive_map[j, beg_pos] = 1 42 | break 43 | else: 44 | positive_map[j, beg_pos : end_pos + 1].fill_(1) 45 | 46 | return positive_map / (positive_map.sum(-1)[:, None] + 1e-6) 47 | 48 | 49 | def build_captions_and_token_span(cat_list, force_lowercase): 50 | """ 51 | Return: 52 | captions: str 53 | cat2tokenspan: dict 54 | { 55 | 'dog': [[0, 2]], 56 | ... 57 | } 58 | """ 59 | 60 | cat2tokenspan = {} 61 | captions = "" 62 | for catname in cat_list: 63 | class_name = catname 64 | if force_lowercase: 65 | class_name = class_name.lower() 66 | if "/" in class_name: 67 | class_name_list: List = class_name.strip().split("/") 68 | class_name_list.append(class_name) 69 | class_name: str = random.choice(class_name_list) 70 | 71 | tokens_positive_i = [] 72 | subnamelist = [i.strip() for i in class_name.strip().split(" ")] 73 | for subname in subnamelist: 74 | if len(subname) == 0: 75 | continue 76 | if len(captions) > 0: 77 | captions = captions + " " 78 | strat_idx = len(captions) 79 | end_idx = strat_idx + len(subname) 80 | tokens_positive_i.append([strat_idx, end_idx]) 81 | captions = captions + subname 82 | 83 | if len(tokens_positive_i) > 0: 84 | captions = captions + " ." 
85 | cat2tokenspan[class_name] = tokens_positive_i 86 | 87 | return captions, cat2tokenspan 88 | 89 | 90 | def build_id2posspan_and_caption(category_dict: dict): 91 | """Build id2pos_span and caption from category_dict 92 | 93 | Args: 94 | category_dict (dict): category_dict 95 | """ 96 | cat_list = [item["name"].lower() for item in category_dict] 97 | id2catname = {item["id"]: item["name"].lower() for item in category_dict} 98 | caption, cat2posspan = build_captions_and_token_span(cat_list, force_lowercase=True) 99 | id2posspan = {catid: cat2posspan[catname] for catid, catname in id2catname.items()} 100 | return id2posspan, caption 101 | -------------------------------------------------------------------------------- /groundingdino/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | { 5 | black --version | grep -E "23\." > /dev/null 6 | } || { 7 | echo "Linter requires 'black==23.*' !" 8 | exit 1 9 | } 10 | 11 | ISORT_VERSION=$(isort --version-number) 12 | if [[ "$ISORT_VERSION" != 5.12* ]]; then 13 | echo "Linter requires isort==5.12.0 !" 14 | exit 1 15 | fi 16 | 17 | echo "Running isort ..." 18 | isort . --atomic 19 | 20 | echo "Running black ..." 21 | black -l 100 . 22 | 23 | echo "Running flake8 ..." 24 | if [ -x "$(command -v flake8)" ]; then 25 | flake8 . 26 | else 27 | python3 -m flake8 . 28 | fi 29 | 30 | echo "Running mypy..." 31 | 32 | mypy --exclude 'setup.py|notebooks' . 33 | -------------------------------------------------------------------------------- /modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder import MaskDecoder 10 | from .prompt_encoder import PromptEncoder 11 | from .transformer import TwoWayTransformer 12 | -------------------------------------------------------------------------------- /modeling/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/image_encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/image_encoder.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/mask_decoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/mask_decoder.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/prompt_encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/prompt_encoder.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/sam.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/sam.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/transformer.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /notebooks/images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/notebooks/images/dog.jpg -------------------------------------------------------------------------------- /notebooks/images/groceries.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/notebooks/images/groceries.jpg -------------------------------------------------------------------------------- /notebooks/images/truck.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/notebooks/images/truck.jpg -------------------------------------------------------------------------------- /outputs/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/outputs/.placeholder -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | addict 2 | diffusers 3 | gradio 4 | huggingface_hub 5 | matplotlib 6 | numpy 7 | onnxruntime 8 | opencv_python 9 | Pillow 10 | pycocotools 11 | PyYAML 12 | requests 13 | setuptools 14 | supervision 15 | termcolor 16 | timm 17 | torch 18 | torchvision 19 | transformers==4.28.1 20 | yapf 21 | accelerate 22 | exif 23 | textblob 24 | einops 25 | omegaconf 26 | ultralytics==8.0.95 27 | clip 28 | inflect -------------------------------------------------------------------------------- /runs/detect/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/runs/detect/.placeholder -------------------------------------------------------------------------------- /segment_anything/.flake8: 
-------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = W503, E203, E221, C901, C408, E741, C407, B017, F811, C101, EXE001, EXE002 3 | max-line-length = 100 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | per-file-ignores = 7 | **/__init__.py:F401,F403,E402 8 | -------------------------------------------------------------------------------- /segment_anything/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . 
All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /segment_anything/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to segment-anything 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints, using the `linter.sh` script in the project's root directory. Linting requires `black==23.*`, `isort==5.12.0`, `flake8`, and `mypy`. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to segment-anything, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 
32 | -------------------------------------------------------------------------------- /segment_anything/README.md: -------------------------------------------------------------------------------- 1 | # Segment Anything 2 | 3 | **[Meta AI Research, FAIR](https://ai.facebook.com/research/)** 4 | 5 | [Alexander Kirillov](https://alexander-kirillov.github.io/), [Eric Mintun](https://ericmintun.github.io/), [Nikhila Ravi](https://nikhilaravi.com/), [Hanzi Mao](https://hanzimao.me/), Chloe Rolland, Laura Gustafson, [Tete Xiao](https://tetexiao.com), [Spencer Whitehead](https://www.spencerwhitehead.com/), Alex Berg, Wan-Yen Lo, [Piotr Dollar](https://pdollar.github.io/), [Ross Girshick](https://www.rossgirshick.info/) 6 | 7 | [[`Paper`](https://ai.facebook.com/research/publications/segment-anything/)] [[`Project`](https://segment-anything.com/)] [[`Demo`](https://segment-anything.com/demo)] [[`Dataset`](https://segment-anything.com/dataset/index.html)] [[`Blog`](https://ai.facebook.com/blog/segment-anything-foundation-model-image-segmentation/)] 8 | 9 | ![SAM design](assets/model_diagram.png?raw=true) 10 | 11 | The **Segment Anything Model (SAM)** produces high quality object masks from input prompts such as points or boxes, and it can be used to generate masks for all objects in an image. It has been trained on a [dataset](https://segment-anything.com/dataset/index.html) of 11 million images and 1.1 billion masks, and has strong zero-shot performance on a variety of segmentation tasks. 12 | 13 |

14 | ![Example masks](assets/masks1.png?raw=true) 15 | ![Example masks](assets/masks2.jpg?raw=true) 16 |

17 | 18 | ## Installation 19 | 20 | The code requires `python>=3.8`, as well as `pytorch>=1.7` and `torchvision>=0.8`. Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install both PyTorch and TorchVision dependencies. Installing both PyTorch and TorchVision with CUDA support is strongly recommended. 21 | 22 | Install Segment Anything: 23 | 24 | ``` 25 | pip install git+https://github.com/facebookresearch/segment-anything.git 26 | ``` 27 | 28 | or clone the repository locally and install with 29 | 30 | ``` 31 | git clone git@github.com:facebookresearch/segment-anything.git 32 | cd segment-anything; pip install -e . 33 | ``` 34 | 35 | The following optional dependencies are necessary for mask post-processing, saving masks in COCO format, the example notebooks, and exporting the model in ONNX format. `jupyter` is also required to run the example notebooks. 36 | ``` 37 | pip install opencv-python pycocotools matplotlib onnxruntime onnx 38 | ``` 39 | 40 | 41 | ## Getting Started 42 | 43 | First download a [model checkpoint](#model-checkpoints). Then the model can be used in just a few lines to get masks from a given prompt: 44 | 45 | ``` 46 | from segment_anything import build_sam, SamPredictor 47 | predictor = SamPredictor(build_sam(checkpoint="<path/to/checkpoint>")) 48 | predictor.set_image(<your_image>) 49 | masks, _, _ = predictor.predict(<input_prompts>) 50 | ``` 51 | 52 | or generate masks for an entire image: 53 | 54 | ``` 55 | from segment_anything import build_sam, SamAutomaticMaskGenerator 56 | mask_generator = SamAutomaticMaskGenerator(build_sam(checkpoint="<path/to/checkpoint>")) 57 | masks = mask_generator.generate(<your_image>) 58 | ``` 59 | 60 | Additionally, masks can be generated for images from the command line: 61 | 62 | ``` 63 | python scripts/amg.py --checkpoint <path/to/checkpoint> --input <image_or_folder> --output <output_directory> 64 | ``` 65 | 66 | See the example notebooks on [using SAM with prompts](/notebooks/predictor_example.ipynb) and [automatically generating masks](/notebooks/automatic_mask_generator_example.ipynb) for more details. 67 | 68 |
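For readers who want a runnable version of the prompted workflow above, here is a minimal sketch. It is an illustration rather than part of the upstream README: the checkpoint filename, sample image path, and point coordinates are assumptions to replace with your own, and the `.to("cuda")` call presumes a CUDA-capable GPU.

```
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamPredictor

# Assumed local files -- substitute your own checkpoint and image.
checkpoint_path = "sam_vit_h_4b8939.pth"
image_path = "notebooks/images/truck.jpg"

# Build the default ViT-H model and move it to the GPU.
sam = sam_model_registry["vit_h"](checkpoint=checkpoint_path)
sam.to("cuda")

predictor = SamPredictor(sam)

# SamPredictor expects an HxWx3 uint8 RGB image.
image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
predictor.set_image(image)

# One foreground point prompt: (x, y) coordinates with label 1.
point_coords = np.array([[500, 375]])
point_labels = np.array([1])

masks, scores, logits = predictor.predict(
    point_coords=point_coords,
    point_labels=point_labels,
    multimask_output=True,  # return several candidate masks
)
best_mask = masks[np.argmax(scores)]  # boolean array of shape (H, W)
```

With a single ambiguous point prompt, `multimask_output=True` and picking the highest-scoring candidate, as above, is usually the safer choice.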

69 | ![Notebook: prompting SAM](assets/notebook1.png?raw=true) 70 | ![Notebook: automatic mask generation](assets/notebook2.png?raw=true) 71 |
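The list returned by `SamAutomaticMaskGenerator.generate` contains one dictionary per mask, which makes downstream filtering straightforward. The sketch below is likewise an illustration rather than upstream documentation; the checkpoint and image paths are assumptions, and it runs on CPU.

```
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator

# Assumed local checkpoint for the smaller ViT-B backbone.
sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
mask_generator = SamAutomaticMaskGenerator(sam)

image = cv2.cvtColor(cv2.imread("notebooks/images/dog.jpg"), cv2.COLOR_BGR2RGB)
masks = mask_generator.generate(image)  # list of dicts, one per mask

# Each record carries a boolean 'segmentation' plus metadata such as
# 'area', 'bbox' (XYWH), 'predicted_iou' and 'stability_score'.
large = [m for m in masks if m["area"] > 5000]
label_map = np.zeros(image.shape[:2], dtype=np.uint16)  # 0 = background
for i, m in enumerate(sorted(large, key=lambda m: m["area"], reverse=True), start=1):
    label_map[m["segmentation"]] = i

print(f"{len(masks)} masks generated, {len(large)} kept after the area filter")
```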

72 | 73 | ## ONNX Export 74 | 75 | SAM's lightweight mask decoder can be exported to ONNX format so that it can be run in any environment that supports ONNX runtime, such as in-browser as showcased in the [demo](https://segment-anything.com/demo). Export the model with 76 | 77 | ``` 78 | python scripts/export_onnx_model.py --checkpoint <path/to/checkpoint> --output <path/to/output> 79 | ``` 80 | 81 | See the [example notebook](https://github.com/facebookresearch/segment-anything/blob/main/notebooks/onnx_model_example.ipynb) for details on how to combine image preprocessing via SAM's backbone with mask prediction using the ONNX model. It is recommended to use the latest stable version of PyTorch for ONNX export. 82 | 83 | ## Model Checkpoints 84 | 85 | Three versions of the model are available, each with a different backbone size. These models can be instantiated by running 86 | ``` 87 | from segment_anything import sam_model_registry 88 | sam = sam_model_registry["<model_type>"](checkpoint="<path/to/checkpoint>") 89 | ``` 90 | Click the links below to download the checkpoint for the corresponding model name. The default model in bold can also be instantiated with `build_sam`, as in the examples in [Getting Started](#getting-started). 91 | 92 | * **`default` or `vit_h`: [ViT-H SAM model.](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth)** 93 | * `vit_l`: [ViT-L SAM model.](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth) 94 | * `vit_b`: [ViT-B SAM model.](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth) 95 | 96 | ## License 97 | The model is licensed under the [Apache 2.0 license](LICENSE). 98 | 99 | ## Contributing 100 | 101 | See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md). 102 | 103 | ## Contributors 104 | 105 | The Segment Anything project was made possible with the help of many contributors (alphabetical): 106 | 107 | Aaron Adcock, Vaibhav Aggarwal, Morteza Behrooz, Cheng-Yang Fu, Ashley Gabriel, Ahuva Goldstand, Allen Goodman, Sumanth Gurram, Jiabo Hu, Somya Jain, Devansh Kukreja, Robert Kuo, Joshua Lane, Yanghao Li, Lilian Luong, Jitendra Malik, Mallika Malhotra, William Ngan, Omkar Parkhi, Nikhil Raina, Dirk Rowe, Neil Sejoor, Vanessa Stark, Bala Varadarajan, Bram Wasti, Zachary Winstrom 108 | -------------------------------------------------------------------------------- /segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | 7 | from .build_sam import ( 8 | build_sam, 9 | build_sam_vit_h, 10 | build_sam_vit_l, 11 | build_sam_vit_b, 12 | sam_model_registry, 13 | ) 14 | from .predictor import SamPredictor 15 | from .automatic_mask_generator import SamAutomaticMaskGenerator 16 | -------------------------------------------------------------------------------- /segment_anything/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/__pycache__/automatic_mask_generator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/__pycache__/automatic_mask_generator.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/__pycache__/build_sam.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/__pycache__/build_sam.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/__pycache__/predictor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/__pycache__/predictor.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/assets/masks1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/masks1.png -------------------------------------------------------------------------------- /segment_anything/assets/masks2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/masks2.jpg -------------------------------------------------------------------------------- /segment_anything/assets/model_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/model_diagram.png -------------------------------------------------------------------------------- /segment_anything/assets/notebook1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/notebook1.png -------------------------------------------------------------------------------- /segment_anything/assets/notebook2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/notebook2.png -------------------------------------------------------------------------------- /segment_anything/build_sam.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam = build_sam_vit_h 25 | 26 | 27 | def build_sam_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_model_registry = { 48 | "default": build_sam, 49 | "vit_h": build_sam, 50 | "vit_l": build_sam_vit_l, 51 | "vit_b": build_sam_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoder( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | ), 99 | pixel_mean=[123.675, 116.28, 103.53], 100 | pixel_std=[58.395, 57.12, 57.375], 101 | ) 102 | sam.eval() 103 | if checkpoint is not None: 104 | with open(checkpoint, "rb") as f: 105 | state_dict = torch.load(f) 106 | sam.load_state_dict(state_dict) 107 | return sam 108 | -------------------------------------------------------------------------------- /segment_anything/linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | { 5 | black --version | grep -E "23\." > /dev/null 6 | } || { 7 | echo "Linter requires 'black==23.*' !" 8 | exit 1 9 | } 10 | 11 | ISORT_VERSION=$(isort --version-number) 12 | if [[ "$ISORT_VERSION" != 5.12* ]]; then 13 | echo "Linter requires isort==5.12.0 !" 14 | exit 1 15 | fi 16 | 17 | echo "Running isort ..." 18 | isort . --atomic 19 | 20 | echo "Running black ..." 
21 | black -l 100 . 22 | 23 | echo "Running flake8 ..." 24 | if [ -x "$(command -v flake8)" ]; then 25 | flake8 . 26 | else 27 | python3 -m flake8 . 28 | fi 29 | 30 | echo "Running mypy..." 31 | 32 | mypy --exclude 'setup.py|notebooks' . 33 | -------------------------------------------------------------------------------- /segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder import MaskDecoder 10 | from .prompt_encoder import PromptEncoder 11 | from .transformer import TwoWayTransformer 12 | -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/image_encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/image_encoder.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/mask_decoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/mask_decoder.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/prompt_encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/prompt_encoder.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/sam.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/sam.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/transformer.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /segment_anything/notebooks/images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/notebooks/images/dog.jpg -------------------------------------------------------------------------------- /segment_anything/notebooks/images/groceries.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/notebooks/images/groceries.jpg -------------------------------------------------------------------------------- /segment_anything/notebooks/images/truck.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/notebooks/images/truck.jpg -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: segment-anything 3 | Version: 1.0 4 | Provides-Extra: all 5 | Provides-Extra: dev 6 | License-File: LICENSE 7 | -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | setup.cfg 4 | 
setup.py 5 | modeling/__init__.py 6 | modeling/common.py 7 | modeling/image_encoder.py 8 | modeling/mask_decoder.py 9 | modeling/prompt_encoder.py 10 | modeling/sam.py 11 | modeling/transformer.py 12 | segment_anything/__init__.py 13 | segment_anything/automatic_mask_generator.py 14 | segment_anything/build_sam.py 15 | segment_anything/predictor.py 16 | segment_anything.egg-info/PKG-INFO 17 | segment_anything.egg-info/SOURCES.txt 18 | segment_anything.egg-info/dependency_links.txt 19 | segment_anything.egg-info/requires.txt 20 | segment_anything.egg-info/top_level.txt 21 | segment_anything/modeling/__init__.py 22 | segment_anything/modeling/common.py 23 | segment_anything/modeling/image_encoder.py 24 | segment_anything/modeling/mask_decoder.py 25 | segment_anything/modeling/prompt_encoder.py 26 | segment_anything/modeling/sam.py 27 | segment_anything/modeling/transformer.py 28 | segment_anything/utils/__init__.py 29 | segment_anything/utils/amg.py 30 | segment_anything/utils/onnx.py 31 | segment_anything/utils/transforms.py 32 | utils/__init__.py 33 | utils/amg.py 34 | utils/onnx.py 35 | utils/transforms.py -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | 2 | [all] 3 | matplotlib 4 | pycocotools 5 | opencv-python 6 | onnx 7 | onnxruntime 8 | 9 | [dev] 10 | flake8 11 | isort 12 | black 13 | mypy 14 | -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | modeling 2 | segment_anything 3 | utils 4 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .build_sam import ( 8 | build_sam, 9 | build_sam_vit_h, 10 | build_sam_vit_l, 11 | build_sam_vit_b, 12 | sam_model_registry, 13 | ) 14 | from .predictor import SamPredictor 15 | from .automatic_mask_generator import SamAutomaticMaskGenerator 16 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam = build_sam_vit_h 25 | 26 | 27 | def build_sam_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_model_registry = { 48 | "default": build_sam, 49 | "vit_h": build_sam, 50 | "vit_l": build_sam_vit_l, 51 | "vit_b": build_sam_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoder( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | ), 99 | pixel_mean=[123.675, 116.28, 103.53], 100 | pixel_std=[58.395, 57.12, 57.375], 101 | ) 102 | sam.eval() 103 | if checkpoint is not None: 104 | with open(checkpoint, "rb") as f: 105 | state_dict = torch.load(f) 106 | sam.load_state_dict(state_dict) 107 | return sam 108 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder import MaskDecoder 10 | from .prompt_encoder import PromptEncoder 11 | from .transformer import TwoWayTransformer 12 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/utils/onnx.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | 11 | from typing import Tuple 12 | 13 | from ..modeling import Sam 14 | from .amg import calculate_stability_score 15 | 16 | 17 | class SamOnnxModel(nn.Module): 18 | """ 19 | This model should not be called directly, but is used in ONNX export. 20 | It combines the prompt encoder, mask decoder, and mask postprocessing of Sam, 21 | with some functions modified to enable model tracing. Also supports extra 22 | options controlling what information. See the ONNX export script for details. 
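(Concretely, return_single_mask, use_stability_score, and return_extra_metrics control which masks and scores are returned.)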
23 | """ 24 | 25 | def __init__( 26 | self, 27 | model: Sam, 28 | return_single_mask: bool, 29 | use_stability_score: bool = False, 30 | return_extra_metrics: bool = False, 31 | ) -> None: 32 | super().__init__() 33 | self.mask_decoder = model.mask_decoder 34 | self.model = model 35 | self.img_size = model.image_encoder.img_size 36 | self.return_single_mask = return_single_mask 37 | self.use_stability_score = use_stability_score 38 | self.stability_score_offset = 1.0 39 | self.return_extra_metrics = return_extra_metrics 40 | 41 | @staticmethod 42 | def resize_longest_image_size( 43 | input_image_size: torch.Tensor, longest_side: int 44 | ) -> torch.Tensor: 45 | input_image_size = input_image_size.to(torch.float32) 46 | scale = longest_side / torch.max(input_image_size) 47 | transformed_size = scale * input_image_size 48 | transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64) 49 | return transformed_size 50 | 51 | def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: 52 | point_coords = point_coords + 0.5 53 | point_coords = point_coords / self.img_size 54 | point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) 55 | point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) 56 | 57 | point_embedding = point_embedding * (point_labels != -1) 58 | point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * ( 59 | point_labels == -1 60 | ) 61 | 62 | for i in range(self.model.prompt_encoder.num_point_embeddings): 63 | point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[ 64 | i 65 | ].weight * (point_labels == i) 66 | 67 | return point_embedding 68 | 69 | def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor: 70 | mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask) 71 | mask_embedding = mask_embedding + ( 72 | 1 - has_mask_input 73 | ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1) 74 | return mask_embedding 75 | 76 | def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor: 77 | masks = F.interpolate( 78 | masks, 79 | size=(self.img_size, self.img_size), 80 | mode="bilinear", 81 | align_corners=False, 82 | ) 83 | 84 | prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size) 85 | masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])] 86 | 87 | orig_im_size = orig_im_size.to(torch.int64) 88 | h, w = orig_im_size[0], orig_im_size[1] 89 | masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False) 90 | return masks 91 | 92 | def select_masks( 93 | self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int 94 | ) -> Tuple[torch.Tensor, torch.Tensor]: 95 | # Determine if we should return the multiclick mask or not from the number of points. 96 | # The reweighting is used to avoid control flow. 
97 | score_reweight = torch.tensor( 98 | [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)] 99 | ).to(iou_preds.device) 100 | score = iou_preds + (num_points - 2.5) * score_reweight 101 | best_idx = torch.argmax(score, dim=1) 102 | masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1) 103 | iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1) 104 | 105 | return masks, iou_preds 106 | 107 | @torch.no_grad() 108 | def forward( 109 | self, 110 | image_embeddings: torch.Tensor, 111 | point_coords: torch.Tensor, 112 | point_labels: torch.Tensor, 113 | mask_input: torch.Tensor, 114 | has_mask_input: torch.Tensor, 115 | orig_im_size: torch.Tensor, 116 | ): 117 | sparse_embedding = self._embed_points(point_coords, point_labels) 118 | dense_embedding = self._embed_masks(mask_input, has_mask_input) 119 | 120 | masks, scores = self.model.mask_decoder.predict_masks( 121 | image_embeddings=image_embeddings, 122 | image_pe=self.model.prompt_encoder.get_dense_pe(), 123 | sparse_prompt_embeddings=sparse_embedding, 124 | dense_prompt_embeddings=dense_embedding, 125 | ) 126 | 127 | if self.use_stability_score: 128 | scores = calculate_stability_score( 129 | masks, self.model.mask_threshold, self.stability_score_offset 130 | ) 131 | 132 | if self.return_single_mask: 133 | masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) 134 | 135 | upscaled_masks = self.mask_postprocessing(masks, orig_im_size) 136 | 137 | if self.return_extra_metrics: 138 | stability_scores = calculate_stability_score( 139 | upscaled_masks, self.model.mask_threshold, self.stability_score_offset 140 | ) 141 | areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1) 142 | return upscaled_masks, scores, stability_scores, areas, masks 143 | 144 | return upscaled_masks, scores, masks 145 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 
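For example, with original_size = (500, 800) and target_length = 1024, get_preprocess_shape returns (640, 1024), so a point (x, y) = (200, 100) maps to (200 * 1024/800, 100 * 640/500) = (256.0, 128.0).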
37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | -------------------------------------------------------------------------------- /segment_anything/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools 6 | skip_glob=*/__init__.py 7 | known_myself=segment_anything 8 | known_third_party=matplotlib,cv2,torch,torchvision,pycocotools,onnx,black,isort 9 | no_lines_before=STDLIB,THIRDPARTY 10 | sections=FUTURE,STDLIB,THIRDPARTY,MYSELF,FIRSTPARTY,LOCALFOLDER 11 | default_section=FIRSTPARTY 12 | -------------------------------------------------------------------------------- /segment_anything/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from setuptools import find_packages, setup 8 | 9 | setup( 10 | name="segment_anything", 11 | version="1.0", 12 | install_requires=[], 13 | packages=find_packages(exclude="notebooks"), 14 | extras_require={ 15 | "all": ["matplotlib", "pycocotools", "opencv-python", "onnx", "onnxruntime"], 16 | "dev": ["flake8", "isort", "black", "mypy"], 17 | }, 18 | ) 19 | -------------------------------------------------------------------------------- /segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /segment_anything/utils/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/utils/__pycache__/amg.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/utils/__pycache__/amg.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/utils/__pycache__/transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/utils/__pycache__/transforms.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/utils/onnx.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | 11 | from typing import Tuple 12 | 13 | from ..modeling import Sam 14 | from .amg import calculate_stability_score 15 | 16 | 17 | class SamOnnxModel(nn.Module): 18 | """ 19 | This model should not be called directly, but is used in ONNX export. 20 | It combines the prompt encoder, mask decoder, and mask postprocessing of Sam, 21 | with some functions modified to enable model tracing. Also supports extra 22 | options controlling what information. See the ONNX export script for details. 
23 | """ 24 | 25 | def __init__( 26 | self, 27 | model: Sam, 28 | return_single_mask: bool, 29 | use_stability_score: bool = False, 30 | return_extra_metrics: bool = False, 31 | ) -> None: 32 | super().__init__() 33 | self.mask_decoder = model.mask_decoder 34 | self.model = model 35 | self.img_size = model.image_encoder.img_size 36 | self.return_single_mask = return_single_mask 37 | self.use_stability_score = use_stability_score 38 | self.stability_score_offset = 1.0 39 | self.return_extra_metrics = return_extra_metrics 40 | 41 | @staticmethod 42 | def resize_longest_image_size( 43 | input_image_size: torch.Tensor, longest_side: int 44 | ) -> torch.Tensor: 45 | input_image_size = input_image_size.to(torch.float32) 46 | scale = longest_side / torch.max(input_image_size) 47 | transformed_size = scale * input_image_size 48 | transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64) 49 | return transformed_size 50 | 51 | def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: 52 | point_coords = point_coords + 0.5 53 | point_coords = point_coords / self.img_size 54 | point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) 55 | point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) 56 | 57 | point_embedding = point_embedding * (point_labels != -1) 58 | point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * ( 59 | point_labels == -1 60 | ) 61 | 62 | for i in range(self.model.prompt_encoder.num_point_embeddings): 63 | point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[ 64 | i 65 | ].weight * (point_labels == i) 66 | 67 | return point_embedding 68 | 69 | def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor: 70 | mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask) 71 | mask_embedding = mask_embedding + ( 72 | 1 - has_mask_input 73 | ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1) 74 | return mask_embedding 75 | 76 | def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor: 77 | masks = F.interpolate( 78 | masks, 79 | size=(self.img_size, self.img_size), 80 | mode="bilinear", 81 | align_corners=False, 82 | ) 83 | 84 | prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size) 85 | masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])] 86 | 87 | orig_im_size = orig_im_size.to(torch.int64) 88 | h, w = orig_im_size[0], orig_im_size[1] 89 | masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False) 90 | return masks 91 | 92 | def select_masks( 93 | self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int 94 | ) -> Tuple[torch.Tensor, torch.Tensor]: 95 | # Determine if we should return the multiclick mask or not from the number of points. 96 | # The reweighting is used to avoid control flow. 
97 | score_reweight = torch.tensor( 98 | [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)] 99 | ).to(iou_preds.device) 100 | score = iou_preds + (num_points - 2.5) * score_reweight 101 | best_idx = torch.argmax(score, dim=1) 102 | masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1) 103 | iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1) 104 | 105 | return masks, iou_preds 106 | 107 | @torch.no_grad() 108 | def forward( 109 | self, 110 | image_embeddings: torch.Tensor, 111 | point_coords: torch.Tensor, 112 | point_labels: torch.Tensor, 113 | mask_input: torch.Tensor, 114 | has_mask_input: torch.Tensor, 115 | orig_im_size: torch.Tensor, 116 | ): 117 | sparse_embedding = self._embed_points(point_coords, point_labels) 118 | dense_embedding = self._embed_masks(mask_input, has_mask_input) 119 | 120 | masks, scores = self.model.mask_decoder.predict_masks( 121 | image_embeddings=image_embeddings, 122 | image_pe=self.model.prompt_encoder.get_dense_pe(), 123 | sparse_prompt_embeddings=sparse_embedding, 124 | dense_prompt_embeddings=dense_embedding, 125 | ) 126 | 127 | if self.use_stability_score: 128 | scores = calculate_stability_score( 129 | masks, self.model.mask_threshold, self.stability_score_offset 130 | ) 131 | 132 | if self.return_single_mask: 133 | masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) 134 | 135 | upscaled_masks = self.mask_postprocessing(masks, orig_im_size) 136 | 137 | if self.return_extra_metrics: 138 | stability_scores = calculate_stability_score( 139 | upscaled_masks, self.model.mask_threshold, self.stability_score_offset 140 | ) 141 | areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1) 142 | return upscaled_masks, scores, stability_scores, areas, masks 143 | 144 | return upscaled_masks, scores, masks 145 | -------------------------------------------------------------------------------- /segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 
37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools 6 | skip_glob=*/__init__.py 7 | known_myself=segment_anything 8 | known_third_party=matplotlib,cv2,torch,torchvision,pycocotools,onnx,black,isort 9 | no_lines_before=STDLIB,THIRDPARTY 10 | sections=FUTURE,STDLIB,THIRDPARTY,MYSELF,FIRSTPARTY,LOCALFOLDER 11 | default_section=FIRSTPARTY 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from setuptools import find_packages, setup 8 | 9 | setup( 10 | name="segment_anything", 11 | version="1.0", 12 | install_requires=[], 13 | packages=find_packages(exclude="notebooks"), 14 | extras_require={ 15 | "all": ["matplotlib", "pycocotools", "opencv-python", "onnx", "onnxruntime"], 16 | "dev": ["flake8", "isort", "black", "mypy"], 17 | }, 18 | ) 19 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | pip install -r requirements.txt 2 | 3 | 4 | wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth 5 | wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth 6 | 7 | wandb login --anonymously -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/amg.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/utils/__pycache__/amg.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/utils/__pycache__/transforms.cpython-39.pyc -------------------------------------------------------------------------------- /utils/onnx.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | 11 | from typing import Tuple 12 | 13 | from ..modeling import Sam 14 | from .amg import calculate_stability_score 15 | 16 | 17 | class SamOnnxModel(nn.Module): 18 | """ 19 | This model should not be called directly, but is used in ONNX export. 20 | It combines the prompt encoder, mask decoder, and mask postprocessing of Sam, 21 | with some functions modified to enable model tracing. Also supports extra 22 | options controlling what information. See the ONNX export script for details. 
23 | """ 24 | 25 | def __init__( 26 | self, 27 | model: Sam, 28 | return_single_mask: bool, 29 | use_stability_score: bool = False, 30 | return_extra_metrics: bool = False, 31 | ) -> None: 32 | super().__init__() 33 | self.mask_decoder = model.mask_decoder 34 | self.model = model 35 | self.img_size = model.image_encoder.img_size 36 | self.return_single_mask = return_single_mask 37 | self.use_stability_score = use_stability_score 38 | self.stability_score_offset = 1.0 39 | self.return_extra_metrics = return_extra_metrics 40 | 41 | @staticmethod 42 | def resize_longest_image_size( 43 | input_image_size: torch.Tensor, longest_side: int 44 | ) -> torch.Tensor: 45 | input_image_size = input_image_size.to(torch.float32) 46 | scale = longest_side / torch.max(input_image_size) 47 | transformed_size = scale * input_image_size 48 | transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64) 49 | return transformed_size 50 | 51 | def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: 52 | point_coords = point_coords + 0.5 53 | point_coords = point_coords / self.img_size 54 | point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) 55 | point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) 56 | 57 | point_embedding = point_embedding * (point_labels != -1) 58 | point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * ( 59 | point_labels == -1 60 | ) 61 | 62 | for i in range(self.model.prompt_encoder.num_point_embeddings): 63 | point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[ 64 | i 65 | ].weight * (point_labels == i) 66 | 67 | return point_embedding 68 | 69 | def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor: 70 | mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask) 71 | mask_embedding = mask_embedding + ( 72 | 1 - has_mask_input 73 | ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1) 74 | return mask_embedding 75 | 76 | def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor: 77 | masks = F.interpolate( 78 | masks, 79 | size=(self.img_size, self.img_size), 80 | mode="bilinear", 81 | align_corners=False, 82 | ) 83 | 84 | prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size) 85 | masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])] 86 | 87 | orig_im_size = orig_im_size.to(torch.int64) 88 | h, w = orig_im_size[0], orig_im_size[1] 89 | masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False) 90 | return masks 91 | 92 | def select_masks( 93 | self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int 94 | ) -> Tuple[torch.Tensor, torch.Tensor]: 95 | # Determine if we should return the multiclick mask or not from the number of points. 96 | # The reweighting is used to avoid control flow. 
97 | score_reweight = torch.tensor( 98 | [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)] 99 | ).to(iou_preds.device) 100 | score = iou_preds + (num_points - 2.5) * score_reweight 101 | best_idx = torch.argmax(score, dim=1) 102 | masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1) 103 | iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1) 104 | 105 | return masks, iou_preds 106 | 107 | @torch.no_grad() 108 | def forward( 109 | self, 110 | image_embeddings: torch.Tensor, 111 | point_coords: torch.Tensor, 112 | point_labels: torch.Tensor, 113 | mask_input: torch.Tensor, 114 | has_mask_input: torch.Tensor, 115 | orig_im_size: torch.Tensor, 116 | ): 117 | sparse_embedding = self._embed_points(point_coords, point_labels) 118 | dense_embedding = self._embed_masks(mask_input, has_mask_input) 119 | 120 | masks, scores = self.model.mask_decoder.predict_masks( 121 | image_embeddings=image_embeddings, 122 | image_pe=self.model.prompt_encoder.get_dense_pe(), 123 | sparse_prompt_embeddings=sparse_embedding, 124 | dense_prompt_embeddings=dense_embedding, 125 | ) 126 | 127 | if self.use_stability_score: 128 | scores = calculate_stability_score( 129 | masks, self.model.mask_threshold, self.stability_score_offset 130 | ) 131 | 132 | if self.return_single_mask: 133 | masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) 134 | 135 | upscaled_masks = self.mask_postprocessing(masks, orig_im_size) 136 | 137 | if self.return_extra_metrics: 138 | stability_scores = calculate_stability_score( 139 | upscaled_masks, self.model.mask_threshold, self.stability_score_offset 140 | ) 141 | areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1) 142 | return upscaled_masks, scores, stability_scores, areas, masks 143 | 144 | return upscaled_masks, scores, masks 145 | -------------------------------------------------------------------------------- /utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 
37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | --------------------------------------------------------------------------------